Home | History | Annotate | Download | only in MagickCore
      1 /*
      2   Copyright 1999-2016 ImageMagick Studio LLC, a non-profit organization
      3   dedicated to making software imaging solutions freely available.
      4 
      5   You may not use this file except in compliance with the License.
      6   obtain a copy of the License at
      7 
      8     http://www.imagemagick.org/script/license.php
      9 
     10   Unless required by applicable law or agreed to in writing, software
     11   distributed under the License is distributed on an "AS IS" BASIS,
     12   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13   See the License for the specific language governing permissions and
     14   limitations under the License.
     15 
     16   MagickCore private kernels for accelerated functions.
     17 */
     18 
     19 #ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
     20 #define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
     21 
     22 #if defined(__cplusplus) || defined(c_plusplus)
     23 extern "C" {
     24 #endif
     25 
     26 #if defined(MAGICKCORE_OPENCL_SUPPORT)
     27 
     28 /*
     29   Define declarations.
     30 */
     31 #define OPENCL_DEFINE(VAR,...)	"\n #""define " #VAR " " #__VA_ARGS__ " \n"
     32 #define OPENCL_ELIF(...)	"\n #""elif " #__VA_ARGS__ " \n"
     33 #define OPENCL_ELSE()		"\n #""else " " \n"
     34 #define OPENCL_ENDIF()		"\n #""endif " " \n"
     35 #define OPENCL_IF(...)		"\n #""if " #__VA_ARGS__ " \n"
     36 #define STRINGIFY(...) #__VA_ARGS__ "\n"
     37 
     38 /*
     39   Typedef declarations.
     40 */
     41 
     42 typedef struct _FloatPixelPacket
     43 {
     44   MagickRealType
     45     red,
     46     green,
     47     blue,
     48     alpha,
     49     black;
     50 } FloatPixelPacket;
     51 
     52 const char *accelerateKernels =
     53 
     54 /*
     55   Define declarations.
     56 */
     57   OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
     58   OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
     59   OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
     60   OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
     61   OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
     62   OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
     63   OPENCL_DEFINE(SigmaRandom, (attenuate))
     64   OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
     65   OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y)))
     66   OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y)))
     67 
     68 /*
     69   Typedef declarations.
     70 */
     71   STRINGIFY(
     72     typedef enum
     73   {
     74     UndefinedColorspace,
     75     RGBColorspace,            /* Linear RGB colorspace */
     76     GRAYColorspace,           /* greyscale (linear) image (faked 1 channel) */
     77     TransparentColorspace,
     78     OHTAColorspace,
     79     LabColorspace,
     80     XYZColorspace,
     81     YCbCrColorspace,
     82     YCCColorspace,
     83     YIQColorspace,
     84     YPbPrColorspace,
     85     YUVColorspace,
     86     CMYKColorspace,           /* negared linear RGB with black separated */
     87     sRGBColorspace,           /* Default: non-lienar sRGB colorspace */
     88     HSBColorspace,
     89     HSLColorspace,
     90     HWBColorspace,
     91     Rec601LumaColorspace,
     92     Rec601YCbCrColorspace,
     93     Rec709LumaColorspace,
     94     Rec709YCbCrColorspace,
     95     LogColorspace,
     96     CMYColorspace,            /* negated linear RGB colorspace */
     97     LuvColorspace,
     98     HCLColorspace,
     99     LCHColorspace,            /* alias for LCHuv */
    100     LMSColorspace,
    101     LCHabColorspace,          /* Cylindrical (Polar) Lab */
    102     LCHuvColorspace,          /* Cylindrical (Polar) Luv */
    103     scRGBColorspace,
    104     HSIColorspace,
    105     HSVColorspace,            /* alias for HSB */
    106     HCLpColorspace,
    107     YDbDrColorspace
    108   } ColorspaceType;
    109   )
    110 
    111   STRINGIFY(
    112     typedef enum
    113     {
    114       UndefinedCompositeOp,
    115       NoCompositeOp,
    116       ModulusAddCompositeOp,
    117       AtopCompositeOp,
    118       BlendCompositeOp,
    119       BumpmapCompositeOp,
    120       ChangeMaskCompositeOp,
    121       ClearCompositeOp,
    122       ColorBurnCompositeOp,
    123       ColorDodgeCompositeOp,
    124       ColorizeCompositeOp,
    125       CopyBlackCompositeOp,
    126       CopyBlueCompositeOp,
    127       CopyCompositeOp,
    128       CopyCyanCompositeOp,
    129       CopyGreenCompositeOp,
    130       CopyMagentaCompositeOp,
    131       CopyOpacityCompositeOp,
    132       CopyRedCompositeOp,
    133       CopyYellowCompositeOp,
    134       DarkenCompositeOp,
    135       DstAtopCompositeOp,
    136       DstCompositeOp,
    137       DstInCompositeOp,
    138       DstOutCompositeOp,
    139       DstOverCompositeOp,
    140       DifferenceCompositeOp,
    141       DisplaceCompositeOp,
    142       DissolveCompositeOp,
    143       ExclusionCompositeOp,
    144       HardLightCompositeOp,
    145       HueCompositeOp,
    146       InCompositeOp,
    147       LightenCompositeOp,
    148       LinearLightCompositeOp,
    149       LuminizeCompositeOp,
    150       MinusDstCompositeOp,
    151       ModulateCompositeOp,
    152       MultiplyCompositeOp,
    153       OutCompositeOp,
    154       OverCompositeOp,
    155       OverlayCompositeOp,
    156       PlusCompositeOp,
    157       ReplaceCompositeOp,
    158       SaturateCompositeOp,
    159       ScreenCompositeOp,
    160       SoftLightCompositeOp,
    161       SrcAtopCompositeOp,
    162       SrcCompositeOp,
    163       SrcInCompositeOp,
    164       SrcOutCompositeOp,
    165       SrcOverCompositeOp,
    166       ModulusSubtractCompositeOp,
    167       ThresholdCompositeOp,
    168       XorCompositeOp,
    169       /* These are new operators, added after the above was last sorted.
    170        * The list should be re-sorted only when a new library version is
    171        * created.
    172        */
    173       DivideDstCompositeOp,
    174       DistortCompositeOp,
    175       BlurCompositeOp,
    176       PegtopLightCompositeOp,
    177       VividLightCompositeOp,
    178       PinLightCompositeOp,
    179       LinearDodgeCompositeOp,
    180       LinearBurnCompositeOp,
    181       MathematicsCompositeOp,
    182       DivideSrcCompositeOp,
    183       MinusSrcCompositeOp,
    184       DarkenIntensityCompositeOp,
    185       LightenIntensityCompositeOp
    186     } CompositeOperator;
    187   )
    188 
    189   STRINGIFY(
    190      typedef enum
    191      {
    192        UndefinedFunction,
    193        ArcsinFunction,
    194        ArctanFunction,
    195        PolynomialFunction,
    196        SinusoidFunction
    197      } MagickFunction;
    198   )
    199 
    200   STRINGIFY(
    201     typedef enum
    202     {
    203       UndefinedNoise,
    204       UniformNoise,
    205       GaussianNoise,
    206       MultiplicativeGaussianNoise,
    207       ImpulseNoise,
    208       LaplacianNoise,
    209       PoissonNoise,
    210       RandomNoise
    211     } NoiseType;
    212   )
    213 
    214   STRINGIFY(
    215     typedef enum
    216   {
    217     UndefinedPixelIntensityMethod = 0,
    218     AveragePixelIntensityMethod,
    219     BrightnessPixelIntensityMethod,
    220     LightnessPixelIntensityMethod,
    221     Rec601LumaPixelIntensityMethod,
    222     Rec601LuminancePixelIntensityMethod,
    223     Rec709LumaPixelIntensityMethod,
    224     Rec709LuminancePixelIntensityMethod,
    225     RMSPixelIntensityMethod,
    226     MSPixelIntensityMethod
    227   } PixelIntensityMethod;
    228   )
    229 
    230   STRINGIFY(
    231   typedef enum {
    232     BoxWeightingFunction = 0,
    233     TriangleWeightingFunction,
    234     CubicBCWeightingFunction,
    235     HannWeightingFunction,
    236     HammingWeightingFunction,
    237     BlackmanWeightingFunction,
    238     GaussianWeightingFunction,
    239     QuadraticWeightingFunction,
    240     JincWeightingFunction,
    241     SincWeightingFunction,
    242     SincFastWeightingFunction,
    243     KaiserWeightingFunction,
    244     WelshWeightingFunction,
    245     BohmanWeightingFunction,
    246     LagrangeWeightingFunction,
    247     CosineWeightingFunction,
    248   } ResizeWeightingFunctionType;
    249   )
    250 
    251   STRINGIFY(
    252      typedef enum
    253      {
    254        UndefinedChannel = 0x0000,
    255        RedChannel = 0x0001,
    256        GrayChannel = 0x0001,
    257        CyanChannel = 0x0001,
    258        GreenChannel = 0x0002,
    259        MagentaChannel = 0x0002,
    260        BlueChannel = 0x0004,
    261        YellowChannel = 0x0004,
    262        BlackChannel = 0x0008,
    263        AlphaChannel = 0x0010,
    264        OpacityChannel = 0x0010,
    265        IndexChannel = 0x0020,             /* Color Index Table? */
    266        ReadMaskChannel = 0x0040,          /* Pixel is Not Readable? */
    267        WriteMaskChannel = 0x0080,         /* Pixel is Write Protected? */
    268        MetaChannel = 0x0100,              /* ???? */
    269        CompositeChannels = 0x001F,
    270        AllChannels = 0x7ffffff,
    271        /*
    272          Special purpose channel types.
    273          FUTURE: are these needed any more - they are more like hacks
    274          SyncChannels for example is NOT a real channel but a 'flag'
    275          It really says -- "User has not defined channels"
    276          Though it does have extra meaning in the "-auto-level" operator
    277        */
    278        TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */
    279        RGBChannels = 0x0200,      /* set alpha from grayscale mask in RGB */
    280        GrayChannels = 0x0400,
    281        SyncChannels = 0x20000,    /* channels modified as a single unit */
    282        DefaultChannels = AllChannels
    283      } ChannelType;  /* must correspond to PixelChannel */
    284   )
    285 
    286 /*
    287   Helper functions.
    288 */
    289 
    290 OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
    291 
    292   STRINGIFY(
    293     inline CLQuantum ScaleCharToQuantum(const unsigned char value)
    294     {
    295       return((CLQuantum) value);
    296     }
    297   )
    298 
    299 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
    300 
    301   STRINGIFY(
    302     inline CLQuantum ScaleCharToQuantum(const unsigned char value)
    303     {
    304       return((CLQuantum) (257.0f*value));
    305     }
    306   )
    307 
    308 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
    309 
    310   STRINGIFY(
    311     inline CLQuantum ScaleCharToQuantum(const unsigned char value)
    312     {
    313       return((CLQuantum) (16843009.0*value));
    314     }
    315   )
    316 
    317 OPENCL_ENDIF()
    318 
    319   STRINGIFY(
    320     inline int ClampToCanvas(const int offset,const int range)
    321       {
    322         return clamp(offset, (int)0, range-1);
    323       }
    324   )
    325 
    326   STRINGIFY(
    327     inline CLQuantum ClampToQuantum(const float value)
    328       {
    329         return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f);
    330       }
    331   )
    332 
    333   STRINGIFY(
    334     inline uint ScaleQuantumToMap(CLQuantum value)
    335       {
    336         if (value >= (CLQuantum) MaxMap)
    337           return ((uint)MaxMap);
    338         else
    339           return ((uint)value);
    340       }
    341   )
    342 
    343   STRINGIFY(
    344     inline float PerceptibleReciprocal(const float x)
    345     {
    346       float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0;
    347       return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon));
    348     }
    349   )
    350 
    351   STRINGIFY(
    352   inline float RoundToUnity(const float value)
    353    {
    354      return clamp(value,0.0f,1.0f);
    355    }
    356   )
    357 
    358   STRINGIFY(
    359 
    360   inline unsigned int getPixelIndex(const unsigned int number_channels,
    361     const unsigned int columns, const unsigned int x, const unsigned int y)
    362   {
    363     return (x * number_channels) + (y * columns * number_channels);
    364   }
    365 
    366   inline float getPixelRed(const __global CLQuantum *p)   { return (float)*p; }
    367   inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); }
    368   inline float getPixelBlue(const __global CLQuantum *p)  { return (float)*(p+2); }
    369   inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); }
    370 
    371   inline void setPixelRed(__global CLQuantum *p,const CLQuantum value)   { *p=value; }
    372   inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; }
    373   inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value)  { *(p+2)=value; }
    374   inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; }
    375 
    376   inline CLQuantum getBlue(CLPixelType p)               { return p.x; }
    377   inline void setBlue(CLPixelType* p, CLQuantum value)  { (*p).x = value; }
    378   inline float getBlueF4(float4 p)                      { return p.x; }
    379   inline void setBlueF4(float4* p, float value)         { (*p).x = value; }
    380 
    381   inline CLQuantum getGreen(CLPixelType p)              { return p.y; }
    382   inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
    383   inline float getGreenF4(float4 p)                     { return p.y; }
    384   inline void setGreenF4(float4* p, float value)        { (*p).y = value; }
    385 
    386   inline CLQuantum getRed(CLPixelType p)                { return p.z; }
    387   inline void setRed(CLPixelType* p, CLQuantum value)   { (*p).z = value; }
    388   inline float getRedF4(float4 p)                       { return p.z; }
    389   inline void setRedF4(float4* p, float value)          { (*p).z = value; }
    390 
    391   inline CLQuantum getAlpha(CLPixelType p)              { return p.w; }
    392   inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; }
    393   inline float getAlphaF4(float4 p)                     { return p.w; }
    394   inline void setAlphaF4(float4* p, float value)        { (*p).w = value; }
    395 
    396   inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels,
    397     const ChannelType channel, float *red, float *green, float *blue, float *alpha)
    398   {
    399     if ((channel & RedChannel) != 0)
    400       *red=getPixelRed(p);
    401 
    402     if (number_channels > 2)
    403       {
    404         if ((channel & GreenChannel) != 0)
    405           *green=getPixelGreen(p);
    406 
    407         if ((channel & BlueChannel) != 0)
    408           *blue=getPixelBlue(p);
    409       }
    410 
    411     if (((number_channels == 4) || (number_channels == 2)) &&
    412         ((channel & AlphaChannel) != 0))
    413       *alpha=getPixelAlpha(p,number_channels);
    414   }
    415 
    416   inline float4 ReadAllChannels(const __global CLQuantum *image, const unsigned int number_channels,
    417     const unsigned int columns, const unsigned int x, const unsigned int y)
    418   {
    419     const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
    420 
    421     float4 pixel;
    422 
    423     pixel.x=getPixelRed(p);
    424 
    425     if (number_channels > 2)
    426       {
    427         pixel.y=getPixelGreen(p);
    428         pixel.z=getPixelBlue(p);
    429       }
    430 
    431     if ((number_channels == 4) || (number_channels == 2))
    432       pixel.w=getPixelAlpha(p,number_channels);
    433     return(pixel);
    434   }
    435 
    436   inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels,
    437     const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel)
    438   {
    439     const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
    440 
    441     float red = 0.0f;
    442     float green = 0.0f;
    443     float blue = 0.0f;
    444     float alpha = 0.0f;
    445 
    446     ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
    447     return (float4)(red, green, blue, alpha);
    448   }
    449 
    450   inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels,
    451     const ChannelType channel, float red, float green, float blue, float alpha)
    452   {
    453     if ((channel & RedChannel) != 0)
    454       setPixelRed(p,ClampToQuantum(red));
    455 
    456     if (number_channels > 2)
    457       {
    458         if ((channel & GreenChannel) != 0)
    459           setPixelGreen(p,ClampToQuantum(green));
    460 
    461         if ((channel & BlueChannel) != 0)
    462           setPixelBlue(p,ClampToQuantum(blue));
    463       }
    464 
    465     if (((number_channels == 4) || (number_channels == 2)) &&
    466         ((channel & AlphaChannel) != 0))
    467       setPixelAlpha(p,number_channels,ClampToQuantum(alpha));
    468   }
    469 
    470   inline void WriteAllChannels(__global CLQuantum *image, const unsigned int number_channels,
    471     const unsigned int columns, const unsigned int x, const unsigned int y, float4 pixel)
    472   {
    473     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
    474 
    475     setPixelRed(p,ClampToQuantum(pixel.x));
    476 
    477     if (number_channels > 2)
    478       {
    479         setPixelGreen(p,ClampToQuantum(pixel.y));
    480         setPixelBlue(p,ClampToQuantum(pixel.z));
    481       }
    482 
    483     if ((number_channels == 4) || (number_channels == 2))
    484       setPixelAlpha(p,number_channels,ClampToQuantum(pixel.w));
    485   }
    486 
    487   inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels,
    488     const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel,
    489     float4 pixel)
    490   {
    491     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
    492     WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w);
    493   }
    494 
    495   inline float GetPixelIntensity(const unsigned int colorspace,
    496     const unsigned int method,float red,float green,float blue)
    497   {
    498     float intensity;
    499 
    500     if (colorspace == GRAYColorspace)
    501       return red;
    502 
    503     switch (method)
    504     {
    505       case AveragePixelIntensityMethod:
    506         {
    507           intensity=(red+green+blue)/3.0;
    508           break;
    509         }
    510       case BrightnessPixelIntensityMethod:
    511         {
    512           intensity=MagickMax(MagickMax(red,green),blue);
    513           break;
    514         }
    515       case LightnessPixelIntensityMethod:
    516         {
    517           intensity=(MagickMin(MagickMin(red,green),blue)+
    518               MagickMax(MagickMax(red,green),blue))/2.0;
    519           break;
    520         }
    521       case MSPixelIntensityMethod:
    522         {
    523           intensity=(float) (((float) red*red+green*green+blue*blue)/
    524               (3.0*QuantumRange));
    525           break;
    526         }
    527       case Rec601LumaPixelIntensityMethod:
    528         {
    529           /*
    530           if (image->colorspace == RGBColorspace)
    531           {
    532             red=EncodePixelGamma(red);
    533             green=EncodePixelGamma(green);
    534             blue=EncodePixelGamma(blue);
    535           }
    536           */
    537           intensity=0.298839*red+0.586811*green+0.114350*blue;
    538           break;
    539         }
    540       case Rec601LuminancePixelIntensityMethod:
    541         {
    542           /*
    543           if (image->colorspace == sRGBColorspace)
    544           {
    545             red=DecodePixelGamma(red);
    546             green=DecodePixelGamma(green);
    547             blue=DecodePixelGamma(blue);
    548           }
    549           */
    550           intensity=0.298839*red+0.586811*green+0.114350*blue;
    551           break;
    552         }
    553       case Rec709LumaPixelIntensityMethod:
    554       default:
    555         {
    556           /*
    557           if (image->colorspace == RGBColorspace)
    558           {
    559             red=EncodePixelGamma(red);
    560             green=EncodePixelGamma(green);
    561             blue=EncodePixelGamma(blue);
    562           }
    563           */
    564           intensity=0.212656*red+0.715158*green+0.072186*blue;
    565           break;
    566         }
    567       case Rec709LuminancePixelIntensityMethod:
    568         {
    569           /*
    570           if (image->colorspace == sRGBColorspace)
    571           {
    572             red=DecodePixelGamma(red);
    573             green=DecodePixelGamma(green);
    574             blue=DecodePixelGamma(blue);
    575           }
    576           */
    577           intensity=0.212656*red+0.715158*green+0.072186*blue;
    578           break;
    579         }
    580       case RMSPixelIntensityMethod:
    581         {
    582           intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/
    583               sqrt(3.0));
    584           break;
    585         }
    586     }
    587 
    588     return intensity;
    589   }
    590 
    591   inline int mirrorBottom(int value)
    592   {
    593       return (value < 0) ? - (value) : value;
    594   }
    595 
    596   inline int mirrorTop(int value, int width)
    597   {
    598       return (value >= width) ? (2 * width - value - 1) : value;
    599   }
    600   )
    601 
    602 /*
    603 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    604 %                                                                             %
    605 %                                                                             %
    606 %                                                                             %
    607 %     A d d N o i s e                                                         %
    608 %                                                                             %
    609 %                                                                             %
    610 %                                                                             %
    611 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    612 */
    613 
    614   STRINGIFY(
    615   /*
    616   Part of MWC64X by David Thomas, dt10 (at) imperial.ac.uk
    617   This is provided under BSD, full license is with the main package.
    618   See http://www.doc.ic.ac.uk/~dt10/research
    619   */
    620 
    621   // Pre: a<M, b<M
    622   // Post: r=(a+b) mod M
    623   ulong MWC_AddMod64(ulong a, ulong b, ulong M)
    624   {
    625     ulong v=a+b;
    626     //if( (v>=M) || (v<a) )
    627     if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug.
    628       v=v-M;
    629     return v;
    630   }
    631 
    632   // Pre: a<M,b<M
    633   // Post: r=(a*b) mod M
    634   // This could be done more efficently, but it is portable, and should
    635   // be easy to understand. It can be replaced with any of the better
    636   // modular multiplication algorithms (for example if you know you have
    637   // double precision available or something).
    638   ulong MWC_MulMod64(ulong a, ulong b, ulong M)
    639   {
    640     ulong r=0;
    641     while(a!=0){
    642       if(a&1)
    643         r=MWC_AddMod64(r,b,M);
    644       b=MWC_AddMod64(b,b,M);
    645       a=a>>1;
    646     }
    647     return r;
    648   }
    649 
    650   // Pre: a<M, e>=0
    651   // Post: r=(a^b) mod M
    652   // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
    653   // most architectures
    654   ulong MWC_PowMod64(ulong a, ulong e, ulong M)
    655   {
    656     ulong sqr=a, acc=1;
    657     while(e!=0){
    658       if(e&1)
    659         acc=MWC_MulMod64(acc,sqr,M);
    660         sqr=MWC_MulMod64(sqr,sqr,M);
    661       e=e>>1;
    662     }
    663     return acc;
    664   }
    665 
    666   uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
    667   {
    668     ulong m=MWC_PowMod64(A, distance, M);
    669     ulong x=curr.x*(ulong)A+curr.y;
    670     x=MWC_MulMod64(x, m, M);
    671     return (uint2)((uint)(x/A), (uint)(x%A));
    672   }
    673 
    674   uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
    675   {
    676     // This is an arbitrary constant for starting LCG jumping from. I didn't
    677     // want to start from 1, as then you end up with the two or three first values
    678     // being a bit poor in ones - once you've decided that, one constant is as
    679     // good as any another. There is no deep mathematical reason for it, I just
    680     // generated a random number.
    681     enum{ MWC_BASEID = 4077358422479273989UL };
    682 
    683     ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
    684     ulong m=MWC_PowMod64(A, dist, M);
    685 
    686     ulong x=MWC_MulMod64(MWC_BASEID, m, M);
    687     return (uint2)((uint)(x/A), (uint)(x%A));
    688   }
    689 
    690   //! Represents the state of a particular generator
    691   typedef struct{ uint x; uint c; } mwc64x_state_t;
    692 
    693   enum{ MWC64X_A = 4294883355U };
    694   enum{ MWC64X_M = 18446383549859758079UL };
    695 
    696   void MWC64X_Step(mwc64x_state_t *s)
    697   {
    698     uint X=s->x, C=s->c;
    699 
    700     uint Xn=MWC64X_A*X+C;
    701     uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar
    702     uint Cn=mad_hi(MWC64X_A,X,carry);
    703 
    704     s->x=Xn;
    705     s->c=Cn;
    706   }
    707 
    708   void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
    709   {
    710     uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), MWC64X_A, MWC64X_M, distance);
    711     s->x=tmp.x;
    712     s->c=tmp.y;
    713   }
    714 
    715   void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
    716   {
    717     uint2 tmp=MWC_SeedImpl_Mod64(MWC64X_A, MWC64X_M, 1, 0, baseOffset, perStreamOffset);
    718     s->x=tmp.x;
    719     s->c=tmp.y;
    720   }
    721 
    722   //! Return a 32-bit integer in the range [0..2^32)
    723   uint MWC64X_NextUint(mwc64x_state_t *s)
    724   {
    725     uint res=s->x ^ s->c;
    726     MWC64X_Step(s);
    727     return res;
    728   }
    729 
    730   //
    731   // End of MWC64X excerpt
    732   //
    733 
    734   float mwcReadPseudoRandomValue(mwc64x_state_t* rng)
    735   {
    736     return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0
    737   }
    738 
    739   float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate)
    740   {
    741     float
    742       alpha,
    743       beta,
    744       noise,
    745       sigma;
    746 
    747     noise = 0.0f;
    748     alpha=mwcReadPseudoRandomValue(r);
    749     switch(noise_type)
    750     {
    751       case UniformNoise:
    752       default:
    753         {
    754           noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
    755           break;
    756         }
    757       case GaussianNoise:
    758         {
    759           float
    760             gamma,
    761             tau;
    762 
    763           if (alpha == 0.0f)
    764             alpha=1.0f;
    765           beta=mwcReadPseudoRandomValue(r);
    766           gamma=sqrt(-2.0f*log(alpha));
    767           sigma=gamma*cospi((2.0f*beta));
    768           tau=gamma*sinpi((2.0f*beta));
    769           noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau;
    770           break;
    771         }
    772       case ImpulseNoise:
    773       {
    774         if (alpha < (SigmaImpulse/2.0f))
    775           noise=0.0f;
    776         else
    777           if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
    778             noise=QuantumRange;
    779           else
    780             noise=pixel;
    781         break;
    782       }
    783       case LaplacianNoise:
    784       {
    785         if (alpha <= 0.5f)
    786           {
    787             if (alpha <= MagickEpsilon)
    788               noise=(pixel-QuantumRange);
    789             else
    790               noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
    791                 0.5f);
    792             break;
    793           }
    794         beta=1.0f-alpha;
    795         if (beta <= (0.5f*MagickEpsilon))
    796           noise=(pixel+QuantumRange);
    797         else
    798           noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
    799         break;
    800       }
    801       case MultiplicativeGaussianNoise:
    802       {
    803         sigma=1.0f;
    804         if (alpha > MagickEpsilon)
    805           sigma=sqrt(-2.0f*log(alpha));
    806         beta=mwcReadPseudoRandomValue(r);
    807         noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma*
    808           cospi((2.0f*beta))/2.0f);
    809         break;
    810       }
    811       case PoissonNoise:
    812       {
    813         float
    814           poisson;
    815         unsigned int i;
    816         poisson=exp(-SigmaPoisson*QuantumScale*pixel);
    817         for (i=0; alpha > poisson; i++)
    818         {
    819           beta=mwcReadPseudoRandomValue(r);
    820           alpha*=beta;
    821         }
    822         noise=(QuantumRange*i/SigmaPoisson);
    823         break;
    824       }
    825       case RandomNoise:
    826       {
    827         noise=(QuantumRange*SigmaRandom*alpha);
    828         break;
    829       }
    830     }
    831     return noise;
    832   }
    833 
    834   __kernel
    835   void AddNoise(const __global CLQuantum *image,
    836     const unsigned int number_channels,const ChannelType channel,
    837     const unsigned int length,const unsigned int pixelsPerWorkItem,
    838     const NoiseType noise_type,const float attenuate,const unsigned int seed0,
    839     const unsigned int seed1,const unsigned int numRandomNumbersPerPixel,
    840     __global CLQuantum *filteredImage)
    841   {
    842     mwc64x_state_t rng;
    843     rng.x = seed0;
    844     rng.c = seed1;
    845 
    846     uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use
    847     uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream);
    848     MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams
    849 
    850     uint pos = get_group_id(0) * get_local_size(0) * pixelsPerWorkItem * number_channels + (get_local_id(0) * number_channels);
    851     uint count = pixelsPerWorkItem;
    852 
    853     while (count > 0 && pos < length)
    854     {
    855       const __global CLQuantum *p = image + pos;
    856       __global CLQuantum *q = filteredImage + pos;
    857 
    858       float red;
    859       float green;
    860       float blue;
    861       float alpha;
    862 
    863       ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
    864 
    865       if ((channel & RedChannel) != 0)
    866         red=mwcGenerateDifferentialNoise(&rng,red,noise_type,attenuate);
    867 
    868       if (number_channels > 2)
    869       {
    870         if ((channel & GreenChannel) != 0)
    871           green=mwcGenerateDifferentialNoise(&rng,green,noise_type,attenuate);
    872 
    873         if ((channel & BlueChannel) != 0)
    874           blue=mwcGenerateDifferentialNoise(&rng,blue,noise_type,attenuate);
    875       }
    876 
    877       if (((number_channels == 4) || (number_channels == 2)) &&
    878           ((channel & AlphaChannel) != 0))
    879         alpha=mwcGenerateDifferentialNoise(&rng,alpha,noise_type,attenuate);
    880 
    881       WriteChannels(q, number_channels, channel, red, green, blue, alpha);
    882 
    883       pos += (get_local_size(0) * number_channels);
    884       count--;
    885     }
    886   }
    887   )
    888 
    889 /*
    890 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    891 %                                                                             %
    892 %                                                                             %
    893 %                                                                             %
    894 %    B l u r                                                                  %
    895 %                                                                             %
    896 %                                                                             %
    897 %                                                                             %
    898 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    899 */
    900 
    901   STRINGIFY(
    902   /*
    903   Reduce image noise and reduce detail levels by row
    904   */
    905   __kernel void BlurRow(const __global CLQuantum *image,
    906     const unsigned int number_channels,const ChannelType channel,
    907     __constant float *filter,const unsigned int width,
    908     const unsigned int imageColumns,const unsigned int imageRows,
    909     __local float4 *temp,__global float4 *tempImage)
    910   {
    911     const int x = get_global_id(0);
    912     const int y = get_global_id(1);
    913 
    914     const int columns = imageColumns;
    915 
    916     const unsigned int radius = (width-1)/2;
    917     const int wsize = get_local_size(0);
    918     const unsigned int loadSize = wsize+width;
    919 
    920     //group coordinate
    921     const int groupX=get_local_size(0)*get_group_id(0);
    922 
    923     //parallel load and clamp
    924     for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
    925     {
    926       int cx = ClampToCanvas(i + groupX - radius, columns);
    927       temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel);
    928     }
    929 
    930     // barrier
    931     barrier(CLK_LOCAL_MEM_FENCE);
    932 
    933     // only do the work if this is not a patched item
    934     if (get_global_id(0) < columns)
    935     {
    936       // compute
    937       float4 result = (float4) 0;
    938 
    939       int i = 0;
    940 
    941       for ( ; i+7 < width; )
    942       {
    943         for (int j=0; j < 8; j++)
    944           result+=filter[i+j]*temp[i+j+get_local_id(0)];
    945         i+=8;
    946       }
    947 
    948       for ( ; i < width; i++)
    949         result+=filter[i]*temp[i+get_local_id(0)];
    950 
    951       // write back to global
    952       tempImage[y*columns+x] = result;
    953     }
    954   }
    955   )
    956 
    957   STRINGIFY(
    958   /*
    959   Reduce image noise and reduce detail levels by line
    960   */
    961   __kernel void BlurColumn(const __global float4 *blurRowData,
    962     const unsigned int number_channels,const ChannelType channel,
    963     __constant float *filter,const unsigned int width,
    964     const unsigned int imageColumns,const unsigned int imageRows,
    965     __local float4 *temp,__global CLQuantum *filteredImage)
    966   {
    967     const int x = get_global_id(0);
    968     const int y = get_global_id(1);
    969 
    970     const int columns = imageColumns;
    971     const int rows = imageRows;
    972 
    973     unsigned int radius = (width-1)/2;
    974     const int wsize = get_local_size(1);
    975     const unsigned int loadSize = wsize+width;
    976 
    977     //group coordinate
    978     const int groupX=get_local_size(0)*get_group_id(0);
    979     const int groupY=get_local_size(1)*get_group_id(1);
    980     //notice that get_local_size(0) is 1, so
    981     //groupX=get_group_id(0);
    982 
    983     //parallel load and clamp
    984     for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
    985       temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
    986 
    987     // barrier
    988     barrier(CLK_LOCAL_MEM_FENCE);
    989 
    990     // only do the work if this is not a patched item
    991     if (get_global_id(1) < rows)
    992     {
    993       // compute
    994       float4 result = (float4) 0;
    995 
    996       int i = 0;
    997 
    998       for ( ; i+7 < width; )
    999       {
   1000         for (int j=0; j < 8; j++)
   1001           result+=filter[i+j]*temp[i+j+get_local_id(1)];
   1002         i+=8;
   1003       }
   1004 
   1005       for ( ; i < width; i++)
   1006         result+=filter[i]*temp[i+get_local_id(1)];
   1007 
   1008       // write back to global
   1009       WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
   1010     }
   1011   }
   1012   )
   1013 
   1014 /*
   1015 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   1016 %                                                                             %
   1017 %                                                                             %
   1018 %                                                                             %
   1019 %    C o n t r a s t                                                          %
   1020 %                                                                             %
   1021 %                                                                             %
   1022 %                                                                             %
   1023 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   1024 */
   1025 
   1026   STRINGIFY(
   1027 
   1028   inline float3 ConvertRGBToHSB(CLPixelType pixel) {
   1029     float3 HueSaturationBrightness;
   1030     HueSaturationBrightness.x = 0.0f; // Hue
   1031     HueSaturationBrightness.y = 0.0f; // Saturation
   1032     HueSaturationBrightness.z = 0.0f; // Brightness
   1033 
   1034     float r=(float) getRed(pixel);
   1035     float g=(float) getGreen(pixel);
   1036     float b=(float) getBlue(pixel);
   1037 
   1038     float tmin=MagickMin(MagickMin(r,g),b);
   1039     float tmax=MagickMax(MagickMax(r,g),b);
   1040 
   1041     if (tmax!=0.0f) {
   1042       float delta=tmax-tmin;
   1043       HueSaturationBrightness.y=delta/tmax;
   1044       HueSaturationBrightness.z=QuantumScale*tmax;
   1045 
   1046       if (delta != 0.0f) {
   1047   HueSaturationBrightness.x = ((r == tmax)?0.0f:((g == tmax)?2.0f:4.0f));
   1048   HueSaturationBrightness.x += ((r == tmax)?(g-b):((g == tmax)?(b-r):(r-g)))/delta;
   1049         HueSaturationBrightness.x/=6.0f;
   1050         HueSaturationBrightness.x += (HueSaturationBrightness.x < 0.0f)?0.0f:1.0f;
   1051       }
   1052     }
   1053     return HueSaturationBrightness;
   1054   }
   1055 
   1056   inline CLPixelType ConvertHSBToRGB(float3 HueSaturationBrightness) {
   1057 
   1058     float hue = HueSaturationBrightness.x;
   1059     float brightness = HueSaturationBrightness.z;
   1060     float saturation = HueSaturationBrightness.y;
   1061 
   1062     CLPixelType rgb;
   1063 
   1064     if (saturation == 0.0f) {
   1065       setRed(&rgb,ClampToQuantum(QuantumRange*brightness));
   1066       setGreen(&rgb,getRed(rgb));
   1067       setBlue(&rgb,getRed(rgb));
   1068     }
   1069     else {
   1070 
   1071       float h=6.0f*(hue-floor(hue));
   1072       float f=h-floor(h);
   1073       float p=brightness*(1.0f-saturation);
   1074       float q=brightness*(1.0f-saturation*f);
   1075       float t=brightness*(1.0f-(saturation*(1.0f-f)));
   1076 
   1077       float clampedBrightness = ClampToQuantum(QuantumRange*brightness);
   1078       float clamped_t = ClampToQuantum(QuantumRange*t);
   1079       float clamped_p = ClampToQuantum(QuantumRange*p);
   1080       float clamped_q = ClampToQuantum(QuantumRange*q);
   1081       int ih = (int)h;
   1082       setRed(&rgb, (ih == 1)?clamped_q:
   1083         (ih == 2 || ih == 3)?clamped_p:
   1084         (ih == 4)?clamped_t:
   1085                  clampedBrightness);
   1086 
   1087       setGreen(&rgb, (ih == 1 || ih == 2)?clampedBrightness:
   1088         (ih == 3)?clamped_q:
   1089         (ih == 4 || ih == 5)?clamped_p:
   1090                  clamped_t);
   1091 
   1092       setBlue(&rgb, (ih == 2)?clamped_t:
   1093         (ih == 3 || ih == 4)?clampedBrightness:
   1094         (ih == 5)?clamped_q:
   1095                  clamped_p);
   1096     }
   1097     return rgb;
   1098   }
   1099 
   1100   __kernel void Contrast(__global CLPixelType *im, const unsigned int sharpen)
   1101   {
   1102 
   1103     const int sign = sharpen!=0?1:-1;
   1104     const int x = get_global_id(0);
   1105     const int y = get_global_id(1);
   1106     const int columns = get_global_size(0);
   1107     const int c = x + y * columns;
   1108 
   1109     CLPixelType pixel = im[c];
   1110     float3 HueSaturationBrightness = ConvertRGBToHSB(pixel);
   1111     float brightness = HueSaturationBrightness.z;
   1112     brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
   1113     brightness = clamp(brightness,0.0f,1.0f);
   1114     HueSaturationBrightness.z = brightness;
   1115 
   1116     CLPixelType filteredPixel = ConvertHSBToRGB(HueSaturationBrightness);
   1117     filteredPixel.w = pixel.w;
   1118     im[c] = filteredPixel;
   1119   }
   1120   )
   1121 
   1122 /*
   1123 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   1124 %                                                                             %
   1125 %                                                                             %
   1126 %                                                                             %
   1127 %    C o n t r a s t S t r e t c h                                            %
   1128 %                                                                             %
   1129 %                                                                             %
   1130 %                                                                             %
   1131 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   1132 */
   1133 
   1134     STRINGIFY(
   1135     /*
   1136     */
   1137     __kernel void Histogram(__global CLPixelType * restrict im,
   1138       const ChannelType channel,
   1139       const unsigned int colorspace,
   1140       const unsigned int method,
   1141       __global uint4 * restrict histogram)
   1142       {
   1143         const int x = get_global_id(0);
   1144         const int y = get_global_id(1);
   1145         const int columns = get_global_size(0);
   1146         const int c = x + y * columns;
   1147         if ((channel & SyncChannels) != 0)
   1148         {
   1149           float red=(float)getRed(im[c]);
   1150           float green=(float)getGreen(im[c]);
   1151           float blue=(float)getBlue(im[c]);
   1152 
   1153           float intensity = GetPixelIntensity(colorspace, method, red, green, blue);
   1154           uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
   1155           atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
   1156         }
   1157         else
   1158         {
   1159           // for equalizing, we always need all channels?
   1160           // otherwise something more
   1161         }
   1162       }
   1163     )
   1164 
   1165     STRINGIFY(
   1166     /*
   1167     */
   1168     __kernel void ContrastStretch(__global CLPixelType * restrict im,
   1169       const ChannelType channel,
   1170       __global CLPixelType * restrict stretch_map,
   1171       const float4 white, const float4 black)
   1172       {
   1173         const int x = get_global_id(0);
   1174         const int y = get_global_id(1);
   1175         const int columns = get_global_size(0);
   1176         const int c = x + y * columns;
   1177 
   1178         uint ePos;
   1179         CLPixelType oValue, eValue;
   1180         CLQuantum red, green, blue, alpha;
   1181 
   1182         //read from global
   1183         oValue=im[c];
   1184 
   1185         if ((channel & RedChannel) != 0)
   1186         {
   1187           if (getRedF4(white) != getRedF4(black))
   1188           {
   1189             ePos = ScaleQuantumToMap(getRed(oValue));
   1190             eValue = stretch_map[ePos];
   1191             red = getRed(eValue);
   1192           }
   1193         }
   1194 
   1195         if ((channel & GreenChannel) != 0)
   1196         {
   1197           if (getGreenF4(white) != getGreenF4(black))
   1198           {
   1199             ePos = ScaleQuantumToMap(getGreen(oValue));
   1200             eValue = stretch_map[ePos];
   1201             green = getGreen(eValue);
   1202           }
   1203         }
   1204 
   1205         if ((channel & BlueChannel) != 0)
   1206         {
   1207           if (getBlueF4(white) != getBlueF4(black))
   1208           {
   1209             ePos = ScaleQuantumToMap(getBlue(oValue));
   1210             eValue = stretch_map[ePos];
   1211             blue = getBlue(eValue);
   1212           }
   1213         }
   1214 
   1215         if ((channel & AlphaChannel) != 0)
   1216         {
   1217           if (getAlphaF4(white) != getAlphaF4(black))
   1218           {
   1219             ePos = ScaleQuantumToMap(getAlpha(oValue));
   1220             eValue = stretch_map[ePos];
   1221             alpha = getAlpha(eValue);
   1222           }
   1223         }
   1224 
   1225         //write back
   1226         im[c]=(CLPixelType)(blue, green, red, alpha);
   1227 
   1228       }
   1229     )
   1230 
   1231 /*
   1232 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   1233 %                                                                             %
   1234 %                                                                             %
   1235 %                                                                             %
   1236 %    C o n v o l v e                                                          %
   1237 %                                                                             %
   1238 %                                                                             %
   1239 %                                                                             %
   1240 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   1241 */
   1242 
   1243   STRINGIFY(
   1244     __kernel
   1245     void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
   1246     const unsigned int imageWidth, const unsigned int imageHeight,
   1247     __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
   1248     const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
   1249 
   1250       int2 blockID;
   1251       blockID.x = get_global_id(0) / get_local_size(0);
   1252       blockID.y = get_global_id(1) / get_local_size(1);
   1253 
   1254       // image area processed by this workgroup
   1255       int2 imageAreaOrg;
   1256       imageAreaOrg.x = blockID.x * get_local_size(0);
   1257       imageAreaOrg.y = blockID.y * get_local_size(1);
   1258 
   1259       int2 midFilterDimen;
   1260       midFilterDimen.x = (filterWidth-1)/2;
   1261       midFilterDimen.y = (filterHeight-1)/2;
   1262 
   1263       int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
   1264 
   1265       // dimension of the local cache
   1266       int2 cachedAreaDimen;
   1267       cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
   1268       cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
   1269 
   1270       // cache the pixels accessed by this workgroup in local memory
   1271       int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
   1272       int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
   1273       int groupSize = get_local_size(0) * get_local_size(1);
   1274       for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
   1275 
   1276         int2 cachedAreaIndex;
   1277         cachedAreaIndex.x = i % cachedAreaDimen.x;
   1278         cachedAreaIndex.y = i / cachedAreaDimen.x;
   1279 
   1280         int2 imagePixelIndex;
   1281         imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
   1282 
   1283         // only support EdgeVirtualPixelMethod through ClampToCanvas
   1284         // TODO: implement other virtual pixel method
   1285         imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
   1286         imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
   1287 
   1288         pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
   1289       }
   1290 
   1291       // cache the filter
   1292       for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
   1293         filterCache[i] = filter[i];
   1294       }
   1295       barrier(CLK_LOCAL_MEM_FENCE);
   1296 
   1297 
   1298       int2 imageIndex;
   1299       imageIndex.x = imageAreaOrg.x + get_local_id(0);
   1300       imageIndex.y = imageAreaOrg.y + get_local_id(1);
   1301 
   1302       // if out-of-range, stops here and quit
   1303       if (imageIndex.x >= imageWidth
   1304         || imageIndex.y >= imageHeight) {
   1305           return;
   1306       }
   1307 
   1308       int filterIndex = 0;
   1309       float4 sum = (float4)0.0f;
   1310       float gamma = 0.0f;
   1311       if (((channel & AlphaChannel) == 0) || (matte == 0)) {
   1312         int cacheIndexY = get_local_id(1);
   1313         for (int j = 0; j < filterHeight; j++) {
   1314           int cacheIndexX = get_local_id(0);
   1315           for (int i = 0; i < filterWidth; i++) {
   1316             CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
   1317             float f = filterCache[filterIndex];
   1318 
   1319             sum.x += f * p.x;
   1320             sum.y += f * p.y;
   1321             sum.z += f * p.z;
   1322             sum.w += f * p.w;
   1323 
   1324             gamma += f;
   1325             filterIndex++;
   1326             cacheIndexX++;
   1327           }
   1328           cacheIndexY++;
   1329         }
   1330       }
   1331       else {
   1332         int cacheIndexY = get_local_id(1);
   1333         for (int j = 0; j < filterHeight; j++) {
   1334           int cacheIndexX = get_local_id(0);
   1335           for (int i = 0; i < filterWidth; i++) {
   1336 
   1337             CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
   1338             float alpha = QuantumScale*p.w;
   1339             float f = filterCache[filterIndex];
   1340             float g = alpha * f;
   1341 
   1342             sum.x += g*p.x;
   1343             sum.y += g*p.y;
   1344             sum.z += g*p.z;
   1345             sum.w += f*p.w;
   1346 
   1347             gamma += g;
   1348             filterIndex++;
   1349             cacheIndexX++;
   1350           }
   1351           cacheIndexY++;
   1352         }
   1353         gamma = PerceptibleReciprocal(gamma);
   1354         sum.xyz = gamma*sum.xyz;
   1355       }
   1356       CLPixelType outputPixel;
   1357       outputPixel.x = ClampToQuantum(sum.x);
   1358       outputPixel.y = ClampToQuantum(sum.y);
   1359       outputPixel.z = ClampToQuantum(sum.z);
   1360       outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
   1361 
   1362       output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
   1363     }
   1364   )
   1365 
   1366   STRINGIFY(
   1367     __kernel
   1368     void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
   1369                   const uint imageWidth, const uint imageHeight,
   1370                   __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
   1371                   const uint matte, const ChannelType channel) {
   1372 
   1373       int2 imageIndex;
   1374       imageIndex.x = get_global_id(0);
   1375       imageIndex.y = get_global_id(1);
   1376 
   1377       /*
   1378       unsigned int imageWidth = get_global_size(0);
   1379       unsigned int imageHeight = get_global_size(1);
   1380       */
   1381       if (imageIndex.x >= imageWidth
   1382           || imageIndex.y >= imageHeight)
   1383           return;
   1384 
   1385       int2 midFilterDimen;
   1386       midFilterDimen.x = (filterWidth-1)/2;
   1387       midFilterDimen.y = (filterHeight-1)/2;
   1388 
   1389       int filterIndex = 0;
   1390       float4 sum = (float4)0.0f;
   1391       float gamma = 0.0f;
   1392       if (((channel & AlphaChannel) == 0) || (matte == 0)) {
   1393         for (int j = 0; j < filterHeight; j++) {
   1394           int2 inputPixelIndex;
   1395           inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
   1396           inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
   1397           for (int i = 0; i < filterWidth; i++) {
   1398             inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
   1399             inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
   1400 
   1401             CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
   1402             float f = filter[filterIndex];
   1403 
   1404             sum.x += f * p.x;
   1405             sum.y += f * p.y;
   1406             sum.z += f * p.z;
   1407             sum.w += f * p.w;
   1408 
   1409             gamma += f;
   1410 
   1411             filterIndex++;
   1412           }
   1413         }
   1414       }
   1415       else {
   1416 
   1417         for (int j = 0; j < filterHeight; j++) {
   1418           int2 inputPixelIndex;
   1419           inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
   1420           inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
   1421           for (int i = 0; i < filterWidth; i++) {
   1422             inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
   1423             inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
   1424 
   1425             CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
   1426             float alpha = QuantumScale*p.w;
   1427             float f = filter[filterIndex];
   1428             float g = alpha * f;
   1429 
   1430             sum.x += g*p.x;
   1431             sum.y += g*p.y;
   1432             sum.z += g*p.z;
   1433             sum.w += f*p.w;
   1434 
   1435             gamma += g;
   1436 
   1437 
   1438             filterIndex++;
   1439           }
   1440         }
   1441         gamma = PerceptibleReciprocal(gamma);
   1442         sum.xyz = gamma*sum.xyz;
   1443       }
   1444 
   1445       CLPixelType outputPixel;
   1446       outputPixel.x = ClampToQuantum(sum.x);
   1447       outputPixel.y = ClampToQuantum(sum.y);
   1448       outputPixel.z = ClampToQuantum(sum.z);
   1449       outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
   1450 
   1451       output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
   1452     }
   1453   )
   1454 
   1455 /*
   1456 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   1457 %                                                                             %
   1458 %                                                                             %
   1459 %                                                                             %
   1460 %     D e s p e c k l e                                                       %
   1461 %                                                                             %
   1462 %                                                                             %
   1463 %                                                                             %
   1464 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   1465 */
   1466 
   1467   STRINGIFY(
   1468 
   1469   __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
   1470   , const unsigned int imageWidth, const unsigned int imageHeight
   1471   , const int2 offset, const int polarity, const int matte) {
   1472 
   1473     int x = get_global_id(0);
   1474     int y = get_global_id(1);
   1475 
   1476     CLPixelType v = inputImage[y*imageWidth+x];
   1477 
   1478     int2 neighbor;
   1479     neighbor.y = y + offset.y;
   1480     neighbor.x = x + offset.x;
   1481 
   1482     int2 clampedNeighbor;
   1483     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
   1484     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
   1485 
   1486     CLPixelType r = (clampedNeighbor.x == neighbor.x
   1487                      && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
   1488     :(CLPixelType)0;
   1489 
   1490     int sv[4];
   1491     sv[0] = (int)v.x;
   1492     sv[1] = (int)v.y;
   1493     sv[2] = (int)v.z;
   1494     sv[3] = (int)v.w;
   1495 
   1496     int sr[4];
   1497     sr[0] = (int)r.x;
   1498     sr[1] = (int)r.y;
   1499     sr[2] = (int)r.z;
   1500     sr[3] = (int)r.w;
   1501 
   1502     if (polarity > 0) {
   1503       \n #pragma unroll 4\n
   1504       for (unsigned int i = 0; i < 4; i++) {
   1505         sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
   1506       }
   1507     }
   1508     else {
   1509       \n #pragma unroll 4\n
   1510       for (unsigned int i = 0; i < 4; i++) {
   1511         sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
   1512       }
   1513 
   1514     }
   1515 
   1516     v.x = (CLQuantum)sv[0];
   1517     v.y = (CLQuantum)sv[1];
   1518     v.z = (CLQuantum)sv[2];
   1519 
   1520     if (matte!=0)
   1521       v.w = (CLQuantum)sv[3];
   1522 
   1523     outputImage[y*imageWidth+x] = v;
   1524 
   1525     }
   1526 
   1527 
   1528   )
   1529 
   1530   STRINGIFY(
   1531 
   1532   __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
   1533   , const unsigned int imageWidth, const unsigned int imageHeight
   1534   , const int2 offset, const int polarity, const int matte) {
   1535 
   1536     int x = get_global_id(0);
   1537     int y = get_global_id(1);
   1538 
   1539     CLPixelType v = inputImage[y*imageWidth+x];
   1540 
   1541     int2 neighbor, clampedNeighbor;
   1542 
   1543     neighbor.y = y + offset.y;
   1544     neighbor.x = x + offset.x;
   1545     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
   1546     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
   1547 
   1548     CLPixelType r = (clampedNeighbor.x == neighbor.x
   1549       && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
   1550     :(CLPixelType)0;
   1551 
   1552 
   1553     neighbor.y = y - offset.y;
   1554     neighbor.x = x - offset.x;
   1555     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
   1556     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
   1557 
   1558     CLPixelType s = (clampedNeighbor.x == neighbor.x
   1559       && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
   1560     :(CLPixelType)0;
   1561 
   1562 
   1563     int sv[4];
   1564     sv[0] = (int)v.x;
   1565     sv[1] = (int)v.y;
   1566     sv[2] = (int)v.z;
   1567     sv[3] = (int)v.w;
   1568 
   1569     int sr[4];
   1570     sr[0] = (int)r.x;
   1571     sr[1] = (int)r.y;
   1572     sr[2] = (int)r.z;
   1573     sr[3] = (int)r.w;
   1574 
   1575     int ss[4];
   1576     ss[0] = (int)s.x;
   1577     ss[1] = (int)s.y;
   1578     ss[2] = (int)s.z;
   1579     ss[3] = (int)s.w;
   1580 
   1581     if (polarity > 0) {
   1582       \n #pragma unroll 4\n
   1583       for (unsigned int i = 0; i < 4; i++) {
   1584         //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] )   ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
   1585         //
   1586         //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
   1587         //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
   1588         sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0)  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
   1589       }
   1590     }
   1591     else {
   1592       \n #pragma unroll 4\n
   1593       for (unsigned int i = 0; i < 4; i++) {
   1594         //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] )   ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
   1595         //
   1596         //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
   1597         sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
   1598       }
   1599     }
   1600 
   1601     v.x = (CLQuantum)sv[0];
   1602     v.y = (CLQuantum)sv[1];
   1603     v.z = (CLQuantum)sv[2];
   1604 
   1605     if (matte!=0)
   1606       v.w = (CLQuantum)sv[3];
   1607 
   1608     outputImage[y*imageWidth+x] = v;
   1609 
   1610     }
   1611   )
   1612 
   1613 /*
   1614 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   1615 %                                                                             %
   1616 %                                                                             %
   1617 %                                                                             %
   1618 %     E q u a l i z e                                                         %
   1619 %                                                                             %
   1620 %                                                                             %
   1621 %                                                                             %
   1622 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   1623 */
   1624 
   1625     STRINGIFY(
   1626     /*
   1627     */
   1628     __kernel void Equalize(__global CLPixelType * restrict im,
   1629       const ChannelType channel,
   1630       __global CLPixelType * restrict equalize_map,
   1631       const float4 white, const float4 black)
   1632       {
   1633         const int x = get_global_id(0);
   1634         const int y = get_global_id(1);
   1635         const int columns = get_global_size(0);
   1636         const int c = x + y * columns;
   1637 
   1638         uint ePos;
   1639         CLPixelType oValue, eValue;
   1640         CLQuantum red, green, blue, alpha;
   1641 
   1642         //read from global
   1643         oValue=im[c];
   1644 
   1645         if ((channel & SyncChannels) != 0)
   1646         {
   1647           if (getRedF4(white) != getRedF4(black))
   1648           {
   1649             ePos = ScaleQuantumToMap(getRed(oValue));
   1650             eValue = equalize_map[ePos];
   1651             red = getRed(eValue);
   1652             ePos = ScaleQuantumToMap(getGreen(oValue));
   1653             eValue = equalize_map[ePos];
   1654             green = getRed(eValue);
   1655             ePos = ScaleQuantumToMap(getBlue(oValue));
   1656             eValue = equalize_map[ePos];
   1657             blue = getRed(eValue);
   1658             ePos = ScaleQuantumToMap(getAlpha(oValue));
   1659             eValue = equalize_map[ePos];
   1660             alpha = getRed(eValue);
   1661 
   1662             //write back
   1663             im[c]=(CLPixelType)(blue, green, red, alpha);
   1664           }
   1665 
   1666         }
   1667 
   1668         // for equalizing, we always need all channels?
   1669         // otherwise something more
   1670 
   1671      }
   1672     )
   1673 
   1674 /*
   1675 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   1676 %                                                                             %
   1677 %                                                                             %
   1678 %                                                                             %
   1679 %     F u n c t i o n                                                         %
   1680 %                                                                             %
   1681 %                                                                             %
   1682 %                                                                             %
   1683 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   1684 */
   1685 
   1686   STRINGIFY(
   1687   /*
   1688   apply FunctionImageChannel(braightness-contrast)
   1689   */
   1690   CLQuantum ApplyFunction(float pixel,const MagickFunction function,
   1691     const unsigned int number_parameters,__constant float *parameters)
   1692   {
   1693     float result = 0.0f;
   1694 
   1695     switch (function)
   1696     {
   1697     case PolynomialFunction:
   1698       {
   1699         for (unsigned int i=0; i < number_parameters; i++)
   1700           result = result*QuantumScale*pixel + parameters[i];
   1701         result *= QuantumRange;
   1702         break;
   1703       }
   1704     case SinusoidFunction:
   1705       {
   1706         float  freq,phase,ampl,bias;
   1707         freq  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
   1708         phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
   1709         ampl  = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
   1710         bias  = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
   1711         result = QuantumRange*(ampl*sin(2.0f*MagickPI*
   1712           (freq*QuantumScale*pixel + phase/360.0f)) + bias);
   1713         break;
   1714       }
   1715     case ArcsinFunction:
   1716       {
   1717         float  width,range,center,bias;
   1718         width  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
   1719         center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
   1720         range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
   1721         bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
   1722 
   1723         result = 2.0f/width*(QuantumScale*pixel - center);
   1724         result = range/MagickPI*asin(result)+bias;
   1725         result = ( result <= -1.0f ) ? bias - range/2.0f : result;
   1726         result = ( result >= 1.0f ) ? bias + range/2.0f : result;
   1727         result *= QuantumRange;
   1728         break;
   1729       }
   1730     case ArctanFunction:
   1731       {
   1732         float slope,range,center,bias;
   1733         slope  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
   1734         center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
   1735         range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
   1736         bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
   1737         result = MagickPI*slope*(QuantumScale*pixel-center);
   1738         result = QuantumRange*(range/MagickPI*atan(result) + bias);
   1739         break;
   1740       }
   1741     case UndefinedFunction:
   1742       break;
   1743     }
   1744     return(ClampToQuantum(result));
   1745   }
   1746   )
   1747 
   1748   STRINGIFY(
   1749   /*
   1750   Improve brightness / contrast of the image
   1751   channel : define which channel is improved
   1752   function : the function called to enchance the brightness contrast
   1753   number_parameters : numbers of parameters
   1754   parameters : the parameter
   1755   */
   1756   __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels,
   1757     const ChannelType channel,const MagickFunction function,const unsigned int number_parameters,
   1758     __constant float *parameters)
   1759   {
   1760     const unsigned int x = get_global_id(0);
   1761     const unsigned int y = get_global_id(1);
   1762     const unsigned int columns = get_global_size(0);
   1763     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
   1764 
   1765     float red;
   1766     float green;
   1767     float blue;
   1768     float alpha;
   1769 
   1770     ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
   1771 
   1772     if ((channel & RedChannel) != 0)
   1773       red=ApplyFunction(red, function, number_parameters, parameters);
   1774 
   1775     if (number_channels > 2)
   1776       {
   1777         if ((channel & GreenChannel) != 0)
   1778           green=ApplyFunction(green, function, number_parameters, parameters);
   1779 
   1780         if ((channel & BlueChannel) != 0)
   1781           blue=ApplyFunction(blue, function, number_parameters, parameters);
   1782       }
   1783 
   1784     if (((number_channels == 4) || (number_channels == 2)) &&
   1785         ((channel & AlphaChannel) != 0))
   1786       alpha=ApplyFunction(alpha, function, number_parameters, parameters);
   1787 
   1788     WriteChannels(p, number_channels, channel, red, green, blue, alpha);
   1789   }
   1790   )
   1791 
   1792 /*
   1793 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   1794 %                                                                             %
   1795 %                                                                             %
   1796 %                                                                             %
   1797 %     G r a y s c a l e                                                       %
   1798 %                                                                             %
   1799 %                                                                             %
   1800 %                                                                             %
   1801 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   1802 */
   1803 
   1804   STRINGIFY(
   1805   __kernel void Grayscale(__global CLQuantum *image,const int number_channels,
   1806     const unsigned int colorspace,const unsigned int method)
   1807   {
   1808     const unsigned int x = get_global_id(0);
   1809     const unsigned int y = get_global_id(1);
   1810     const unsigned int columns = get_global_size(0);
   1811     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
   1812 
   1813     float
   1814       blue,
   1815       green,
   1816       red;
   1817 
   1818     red=getPixelRed(p);
   1819     green=getPixelGreen(p);
   1820     blue=getPixelBlue(p);
   1821 
   1822     CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue));
   1823 
   1824     setPixelRed(p,intensity);
   1825     setPixelGreen(p,intensity);
   1826     setPixelBlue(p,intensity);
   1827   }
   1828   )
   1829 
   1830 /*
   1831 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   1832 %                                                                             %
   1833 %                                                                             %
   1834 %                                                                             %
   1835 %     L o c a l C o n t r a s t                                               %
   1836 %                                                                             %
   1837 %                                                                             %
   1838 %                                                                             %
   1839 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   1840 */
   1841 
   1842     STRINGIFY(
   1843 
   1844       __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
   1845           const int radius,
   1846           const int imageWidth,
   1847           const int imageHeight)
   1848       {
   1849         const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
   1850 
   1851         int x = get_local_id(0);
   1852         int y = get_global_id(1);
   1853 
   1854         global CLPixelType *src = srcImage + y * imageWidth;
   1855 
   1856         for (int i = x; i < imageWidth; i += get_local_size(0)) {
   1857             float sum = 0.0f;
   1858             float weight = 1.0f;
   1859 
   1860             int j = i - radius;
   1861             while ((j + 7) < i) {
   1862                 for (int k = 0; k < 8; ++k) // Unroll 8x
   1863                     sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
   1864                 weight += 8.0f;
   1865                 j+=8;
   1866             }
   1867             while (j < i) {
   1868                 sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
   1869                 weight += 1.0f;
   1870                 ++j;
   1871             }
   1872 
   1873             while ((j + 7) < radius + i) {
   1874                 for (int k = 0; k < 8; ++k) // Unroll 8x
   1875                     sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
   1876                 weight -= 8.0f;
   1877                 j+=8;
   1878             }
   1879             while (j < radius + i) {
   1880                 sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
   1881                 weight -= 1.0f;
   1882                 ++j;
   1883             }
   1884 
   1885             tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
   1886         }
   1887       }
   1888     )
   1889 
   1890     STRINGIFY(
   1891       __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
   1892           const int radius,
   1893           const float strength,
   1894           const int imageWidth,
   1895           const int imageHeight)
   1896       {
   1897         const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
   1898 
   1899         int x = get_global_id(0);
   1900         int y = get_global_id(1);
   1901 
   1902         if ((x >= imageWidth) || (y >= imageHeight))
   1903                 return;
   1904 
   1905         global float *src = blurImage + x;
   1906 
   1907         float sum = 0.0f;
   1908         float weight = 1.0f;
   1909 
   1910         int j = y - radius;
   1911         while ((j + 7) < y) {
   1912             for (int k = 0; k < 8; ++k) // Unroll 8x
   1913                 sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
   1914             weight += 8.0f;
   1915             j+=8;
   1916         }
   1917         while (j < y) {
   1918             sum += weight * src[mirrorBottom(j) * imageWidth];
   1919             weight += 1.0f;
   1920             ++j;
   1921         }
   1922 
   1923         while ((j + 7) < radius + y) {
   1924             for (int k = 0; k < 8; ++k) // Unroll 8x
   1925                 sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
   1926             weight -= 8.0f;
   1927             j+=8;
   1928         }
   1929         while (j < radius + y) {
   1930             sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
   1931             weight -= 1.0f;
   1932             ++j;
   1933         }
   1934 
   1935         CLPixelType pixel = srcImage[x + y * imageWidth];
   1936         float srcVal = dot(RGB, convert_float4(pixel));
   1937         float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
   1938         mult = (srcVal + mult) / srcVal;
   1939 
   1940         pixel.x = ClampToQuantum(pixel.x * mult);
   1941         pixel.y = ClampToQuantum(pixel.y * mult);
   1942         pixel.z = ClampToQuantum(pixel.z * mult);
   1943 
   1944         dstImage[x + y * imageWidth] = pixel;
   1945       }
   1946     )
   1947 
   1948 /*
   1949 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   1950 %                                                                             %
   1951 %                                                                             %
   1952 %                                                                             %
   1953 %     M o d u l a t e                                                         %
   1954 %                                                                             %
   1955 %                                                                             %
   1956 %                                                                             %
   1957 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   1958 */
   1959 
   1960   STRINGIFY(
   1961 
   1962   inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
   1963     float *hue, float *saturation, float *lightness)
   1964   {
   1965   float
   1966     c,
   1967     tmax,
   1968     tmin;
   1969 
   1970   /*
   1971      Convert RGB to HSL colorspace.
   1972      */
   1973   tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
   1974   tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
   1975 
   1976   c=tmax-tmin;
   1977 
   1978   *lightness=(tmax+tmin)/2.0;
   1979   if (c <= 0.0)
   1980   {
   1981     *hue=0.0;
   1982     *saturation=0.0;
   1983     return;
   1984   }
   1985 
   1986   if (tmax == (QuantumScale*red))
   1987   {
   1988     *hue=(QuantumScale*green-QuantumScale*blue)/c;
   1989     if ((QuantumScale*green) < (QuantumScale*blue))
   1990       *hue+=6.0;
   1991   }
   1992   else
   1993     if (tmax == (QuantumScale*green))
   1994       *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
   1995     else
   1996       *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
   1997 
   1998   *hue*=60.0/360.0;
   1999   if (*lightness <= 0.5)
   2000     *saturation=c/(2.0*(*lightness));
   2001   else
   2002     *saturation=c/(2.0-2.0*(*lightness));
   2003   }
   2004 
   2005   inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
   2006       CLQuantum *red,CLQuantum *green,CLQuantum *blue)
   2007   {
   2008     float
   2009       b,
   2010       c,
   2011       g,
   2012       h,
   2013       tmin,
   2014       r,
   2015       x;
   2016 
   2017     /*
   2018        Convert HSL to RGB colorspace.
   2019        */
   2020     h=hue*360.0;
   2021     if (lightness <= 0.5)
   2022       c=2.0*lightness*saturation;
   2023     else
   2024       c=(2.0-2.0*lightness)*saturation;
   2025     tmin=lightness-0.5*c;
   2026     h-=360.0*floor(h/360.0);
   2027     h/=60.0;
   2028     x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
   2029     switch ((int) floor(h) % 6)
   2030     {
   2031       case 0:
   2032         {
   2033           r=tmin+c;
   2034           g=tmin+x;
   2035           b=tmin;
   2036           break;
   2037         }
   2038       case 1:
   2039         {
   2040           r=tmin+x;
   2041           g=tmin+c;
   2042           b=tmin;
   2043           break;
   2044         }
   2045       case 2:
   2046         {
   2047           r=tmin;
   2048           g=tmin+c;
   2049           b=tmin+x;
   2050           break;
   2051         }
   2052       case 3:
   2053         {
   2054           r=tmin;
   2055           g=tmin+x;
   2056           b=tmin+c;
   2057           break;
   2058         }
   2059       case 4:
   2060         {
   2061           r=tmin+x;
   2062           g=tmin;
   2063           b=tmin+c;
   2064           break;
   2065         }
   2066       case 5:
   2067         {
   2068           r=tmin+c;
   2069           g=tmin;
   2070           b=tmin+x;
   2071           break;
   2072         }
   2073       default:
   2074         {
   2075           r=0.0;
   2076           g=0.0;
   2077           b=0.0;
   2078         }
   2079     }
   2080     *red=ClampToQuantum(QuantumRange*r);
   2081     *green=ClampToQuantum(QuantumRange*g);
   2082     *blue=ClampToQuantum(QuantumRange*b);
   2083   }
   2084 
   2085   inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness,
   2086     CLQuantum *red,CLQuantum *green,CLQuantum *blue)
   2087   {
   2088     float
   2089       hue,
   2090       lightness,
   2091       saturation;
   2092 
   2093     /*
   2094     Increase or decrease color lightness, saturation, or hue.
   2095     */
   2096     ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
   2097     hue+=0.5*(0.01*percent_hue-1.0);
   2098     while (hue < 0.0)
   2099       hue+=1.0;
   2100     while (hue >= 1.0)
   2101       hue-=1.0;
   2102     saturation*=0.01*percent_saturation;
   2103     lightness*=0.01*percent_lightness;
   2104     ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
   2105   }
   2106 
   2107   __kernel void Modulate(__global CLPixelType *im,
   2108     const float percent_brightness,
   2109     const float percent_hue,
   2110     const float percent_saturation,
   2111     const int colorspace)
   2112   {
   2113 
   2114     const int x = get_global_id(0);
   2115     const int y = get_global_id(1);
   2116     const int columns = get_global_size(0);
   2117     const int c = x + y * columns;
   2118 
   2119     CLPixelType pixel = im[c];
   2120 
   2121     CLQuantum
   2122         blue,
   2123         green,
   2124         red;
   2125 
   2126     red=getRed(pixel);
   2127     green=getGreen(pixel);
   2128     blue=getBlue(pixel);
   2129 
   2130     switch (colorspace)
   2131     {
   2132       case HSLColorspace:
   2133       default:
   2134         {
   2135           ModulateHSL(percent_hue, percent_saturation, percent_brightness,
   2136               &red, &green, &blue);
   2137         }
   2138 
   2139     }
   2140 
   2141     CLPixelType filteredPixel;
   2142 
   2143     setRed(&filteredPixel, red);
   2144     setGreen(&filteredPixel, green);
   2145     setBlue(&filteredPixel, blue);
   2146     filteredPixel.w = pixel.w;
   2147 
   2148     im[c] = filteredPixel;
   2149   }
   2150   )
   2151 
   2152 /*
   2153 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   2154 %                                                                             %
   2155 %                                                                             %
   2156 %                                                                             %
   2157 %     M o t i o n B l u r                                                     %
   2158 %                                                                             %
   2159 %                                                                             %
   2160 %                                                                             %
   2161 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   2162 */
   2163 
   2164   STRINGIFY(
   2165     __kernel
   2166     void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
   2167                     const unsigned int imageWidth, const unsigned int imageHeight,
   2168                     const __global float *filter, const unsigned int width, const __global int2* offset,
   2169                     const float4 bias,
   2170                     const ChannelType channel, const unsigned int matte) {
   2171 
   2172       int2 currentPixel;
   2173       currentPixel.x = get_global_id(0);
   2174       currentPixel.y = get_global_id(1);
   2175 
   2176       if (currentPixel.x >= imageWidth
   2177           || currentPixel.y >= imageHeight)
   2178           return;
   2179 
   2180       float4 pixel;
   2181       pixel.x = (float)bias.x;
   2182       pixel.y = (float)bias.y;
   2183       pixel.z = (float)bias.z;
   2184       pixel.w = (float)bias.w;
   2185 
   2186       if (((channel & AlphaChannel) == 0) || (matte == 0)) {
   2187 
   2188         for (int i = 0; i < width; i++) {
   2189           // only support EdgeVirtualPixelMethod through ClampToCanvas
   2190           // TODO: implement other virtual pixel method
   2191           int2 samplePixel = currentPixel + offset[i];
   2192           samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
   2193           samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
   2194           CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
   2195 
   2196           pixel.x += (filter[i] * (float)samplePixelValue.x);
   2197           pixel.y += (filter[i] * (float)samplePixelValue.y);
   2198           pixel.z += (filter[i] * (float)samplePixelValue.z);
   2199           pixel.w += (filter[i] * (float)samplePixelValue.w);
   2200         }
   2201 
   2202         CLPixelType outputPixel;
   2203         outputPixel.x = ClampToQuantum(pixel.x);
   2204         outputPixel.y = ClampToQuantum(pixel.y);
   2205         outputPixel.z = ClampToQuantum(pixel.z);
   2206         outputPixel.w = ClampToQuantum(pixel.w);
   2207         output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
   2208       }
   2209       else {
   2210 
   2211         float gamma = 0.0f;
   2212         for (int i = 0; i < width; i++) {
   2213           // only support EdgeVirtualPixelMethod through ClampToCanvas
   2214           // TODO: implement other virtual pixel method
   2215           int2 samplePixel = currentPixel + offset[i];
   2216           samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
   2217           samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
   2218 
   2219           CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
   2220 
   2221           float alpha = QuantumScale*samplePixelValue.w;
   2222           float k = filter[i];
   2223           pixel.x = pixel.x + k * alpha * samplePixelValue.x;
   2224           pixel.y = pixel.y + k * alpha * samplePixelValue.y;
   2225           pixel.z = pixel.z + k * alpha * samplePixelValue.z;
   2226 
   2227           pixel.w += k * alpha * samplePixelValue.w;
   2228 
   2229           gamma+=k*alpha;
   2230         }
   2231         gamma = PerceptibleReciprocal(gamma);
   2232         pixel.xyz = gamma*pixel.xyz;
   2233 
   2234         CLPixelType outputPixel;
   2235         outputPixel.x = ClampToQuantum(pixel.x);
   2236         outputPixel.y = ClampToQuantum(pixel.y);
   2237         outputPixel.z = ClampToQuantum(pixel.z);
   2238         outputPixel.w = ClampToQuantum(pixel.w);
   2239         output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
   2240       }
   2241     }
   2242   )
   2243 
   2244 /*
   2245 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   2246 %                                                                             %
   2247 %                                                                             %
   2248 %                                                                             %
   2249 %     R e s i z e                                                             %
   2250 %                                                                             %
   2251 %                                                                             %
   2252 %                                                                             %
   2253 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   2254 */
   2255 
   2256   STRINGIFY(
   2257   // Based on Box from resize.c
   2258   float BoxResizeFilter(const float x)
   2259   {
   2260     return 1.0f;
   2261   }
   2262   )
   2263 
   2264   STRINGIFY(
   2265   // Based on CubicBC from resize.c
   2266   float CubicBC(const float x,const __global float* resizeFilterCoefficients)
   2267   {
   2268     /*
   2269     Cubic Filters using B,C determined values:
   2270     Mitchell-Netravali  B = 1/3 C = 1/3  "Balanced" cubic spline filter
   2271     Catmull-Rom         B = 0   C = 1/2  Interpolatory and exact on linears
   2272     Spline              B = 1   C = 0    B-Spline Gaussian approximation
   2273     Hermite             B = 0   C = 0    B-Spline interpolator
   2274 
   2275     See paper by Mitchell and Netravali, Reconstruction Filters in Computer
   2276     Graphics Computer Graphics, Volume 22, Number 4, August 1988
   2277     http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
   2278     Mitchell.pdf.
   2279 
   2280     Coefficents are determined from B,C values:
   2281     P0 = (  6 - 2*B       )/6 = coeff[0]
   2282     P1 =         0
   2283     P2 = (-18 +12*B + 6*C )/6 = coeff[1]
   2284     P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
   2285     Q0 = (      8*B +24*C )/6 = coeff[3]
   2286     Q1 = (    -12*B -48*C )/6 = coeff[4]
   2287     Q2 = (      6*B +30*C )/6 = coeff[5]
   2288     Q3 = (    - 1*B - 6*C )/6 = coeff[6]
   2289 
   2290     which are used to define the filter:
   2291 
   2292     P0 + P1*x + P2*x^2 + P3*x^3      0 <= x < 1
   2293     Q0 + Q1*x + Q2*x^2 + Q3*x^3      1 <= x < 2
   2294 
   2295     which ensures function is continuous in value and derivative (slope).
   2296     */
   2297     if (x < 1.0)
   2298       return(resizeFilterCoefficients[0]+x*(x*
   2299       (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
   2300     if (x < 2.0)
   2301       return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
   2302       (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
   2303     return(0.0);
   2304   }
   2305   )
   2306 
   2307   STRINGIFY(
   2308   float Sinc(const float x)
   2309   {
   2310     if (x != 0.0f)
   2311     {
   2312       const float alpha=(float) (MagickPI*x);
   2313       return sinpi(x)/alpha;
   2314     }
   2315     return(1.0f);
   2316   }
   2317   )
   2318 
   2319   STRINGIFY(
   2320   float Triangle(const float x)
   2321   {
   2322     /*
   2323     1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
   2324     a Bartlett 2D Cone filter.  Also used as a Bartlett Windowing function
   2325     for Sinc().
   2326     */
   2327     return ((x<1.0f)?(1.0f-x):0.0f);
   2328   }
   2329   )
   2330 
   2331 
   2332   STRINGIFY(
   2333   float Hann(const float x)
   2334   {
   2335     /*
   2336     Cosine window function:
   2337       0.5+0.5*cos(pi*x).
   2338     */
   2339     const float cosine=cos((MagickPI*x));
   2340     return(0.5f+0.5f*cosine);
   2341   }
   2342   )
   2343 
   2344   STRINGIFY(
   2345   float Hamming(const float x)
   2346   {
   2347     /*
   2348       Offset cosine window function:
   2349        .54 + .46 cos(pi x).
   2350     */
   2351     const float cosine=cos((MagickPI*x));
   2352     return(0.54f+0.46f*cosine);
   2353   }
   2354   )
   2355 
   2356   STRINGIFY(
   2357   float Blackman(const float x)
   2358   {
   2359     /*
   2360       Blackman: 2nd order cosine windowing function:
   2361         0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
   2362 
   2363       Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
   2364       five flops.
   2365     */
   2366     const float cosine=cos((MagickPI*x));
   2367     return(0.34f+cosine*(0.5f+cosine*0.16f));
   2368   }
   2369   )
   2370 
   2371   STRINGIFY(
   2372   inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
   2373   {
   2374     switch (filterType)
   2375     {
   2376     /* Call Sinc even for SincFast to get better precision on GPU
   2377        and to avoid thread divergence.  Sinc is pretty fast on GPU anyway...*/
   2378     case SincWeightingFunction:
   2379     case SincFastWeightingFunction:
   2380       return Sinc(x);
   2381     case CubicBCWeightingFunction:
   2382       return CubicBC(x,filterCoefficients);
   2383     case BoxWeightingFunction:
   2384       return BoxResizeFilter(x);
   2385     case TriangleWeightingFunction:
   2386       return Triangle(x);
   2387     case HannWeightingFunction:
   2388       return Hann(x);
   2389     case HammingWeightingFunction:
   2390       return Hamming(x);
   2391     case BlackmanWeightingFunction:
   2392       return Blackman(x);
   2393 
   2394     default:
   2395       return 0.0f;
   2396     }
   2397   }
   2398   )
   2399 
   2400 
   2401   STRINGIFY(
   2402   inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
   2403            , const ResizeWeightingFunctionType resizeWindowType
   2404            , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
   2405   {
   2406     float scale;
   2407     float xBlur = fabs(x/resizeFilterBlur);
   2408     if (resizeWindowSupport < MagickEpsilon
   2409         || resizeWindowType == BoxWeightingFunction)
   2410     {
   2411       scale = 1.0f;
   2412     }
   2413     else
   2414     {
   2415       scale = resizeFilterScale;
   2416       scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
   2417     }
   2418     float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
   2419     return weight;
   2420   }
   2421 
   2422   )
   2423 
   2424   ;
   2425   const char *accelerateKernels2 =
   2426 
   2427   STRINGIFY(
   2428 
   2429   inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
   2430     return (numWorkItems/pixelPerWorkgroup);
   2431   }
   2432 
   2433   // returns the index of the pixel for the current workitem to compute.
   2434   // returns -1 if this workitem doesn't need to participate in any computation
   2435   inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
   2436     const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
   2437     int pixelIndex = itemID/numWorkItemsPerPixel;
   2438     pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
   2439     return pixelIndex;
   2440   }
   2441 
   2442   )
   2443 
   2444   STRINGIFY(
   2445   __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
   2446     void ResizeHorizontalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
   2447       const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
   2448       const unsigned int filteredColumns, const unsigned int filteredRows, const float xFactor,
   2449       const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
   2450       const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
   2451       const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
   2452       const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
   2453       __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
   2454   {
   2455     // calculate the range of resized image pixels computed by this workgroup
   2456     const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
   2457     const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
   2458     const unsigned int actualNumPixelToCompute = stopX - startX;
   2459 
   2460     // calculate the range of input image pixels to cache
   2461     float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
   2462     const float support = MagickMax(scale*resizeFilterSupport,0.5f);
   2463     scale = PerceptibleReciprocal(scale);
   2464 
   2465     const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
   2466     const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
   2467 
   2468     // cache the input pixels into local memory
   2469     const unsigned int y = get_global_id(1);
   2470     const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y);
   2471     const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels;
   2472     event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0);
   2473     wait_group_events(1, &e);
   2474 
   2475     unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
   2476     unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
   2477     for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
   2478     {
   2479       const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
   2480       const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
   2481       const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
   2482 
   2483       // determine which resized pixel computed by this workitem
   2484       const unsigned int itemID = get_local_id(0);
   2485       const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
   2486 
   2487       const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
   2488 
   2489       float4 filteredPixel = (float4)0.0f;
   2490       float density = 0.0f;
   2491       float gamma = 0.0f;
   2492       // -1 means this workitem doesn't participate in the computation
   2493       if (pixelIndex != -1)
   2494       {
   2495         // x coordinated of the resized pixel computed by this workitem
   2496         const int x = chunkStartX + pixelIndex;
   2497 
   2498         // calculate how many steps required for this pixel
   2499         const float bisect = (x+0.5)/xFactor+MagickEpsilon;
   2500         const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
   2501         const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
   2502         const unsigned int n = stop - start;
   2503 
   2504         // calculate how many steps this workitem will contribute
   2505         unsigned int numStepsPerWorkItem = n / numItems;
   2506         numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
   2507 
   2508         const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
   2509         if (startStep < n)
   2510         {
   2511           const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
   2512 
   2513           unsigned int cacheIndex = start+startStep-cacheRangeStartX;
   2514           for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
   2515           {
   2516             float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
   2517               (ResizeWeightingFunctionType) resizeFilterType,
   2518               (ResizeWeightingFunctionType) resizeWindowType,
   2519               resizeFilterScale, resizeFilterWindowSupport,
   2520               resizeFilterBlur, scale*(start + i - bisect + 0.5));
   2521 
   2522             float4 cp = (float4)0.0f;
   2523 
   2524             __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels);
   2525             cp.x = (float) *(p);
   2526             if (number_channels > 2)
   2527             {
   2528               cp.y = (float) *(p + 1);
   2529               cp.z = (float) *(p + 2);
   2530             }
   2531 
   2532             if (alpha_index != 0)
   2533             {
   2534               cp.w = (float) *(p + alpha_index);
   2535 
   2536               float alpha = weight * QuantumScale * cp.w;
   2537 
   2538               filteredPixel.x += alpha * cp.x;
   2539               filteredPixel.y += alpha * cp.y;
   2540               filteredPixel.z += alpha * cp.z;
   2541               filteredPixel.w += weight * cp.w;
   2542               gamma += alpha;
   2543             }
   2544             else
   2545               filteredPixel += ((float4) weight)*cp;
   2546 
   2547             density += weight;
   2548           }
   2549         }
   2550       }
   2551 
   2552       // initialize the accumulators to zero
   2553       if (itemID < actualNumPixelInThisChunk) {
   2554         outputPixelCache[itemID] = (float4)0.0f;
   2555         densityCache[itemID] = 0.0f;
   2556         if (alpha_index != 0)
   2557           gammaCache[itemID] = 0.0f;
   2558       }
   2559       barrier(CLK_LOCAL_MEM_FENCE);
   2560 
   2561       // accumulatte the filtered pixel value and the density
   2562       for (unsigned int i = 0; i < numItems; i++) {
   2563         if (pixelIndex != -1) {
   2564           if (itemID%numItems == i) {
   2565             outputPixelCache[pixelIndex]+=filteredPixel;
   2566             densityCache[pixelIndex]+=density;
   2567             if (alpha_index != 0)
   2568               gammaCache[pixelIndex]+=gamma;
   2569           }
   2570         }
   2571         barrier(CLK_LOCAL_MEM_FENCE);
   2572       }
   2573 
   2574       if (itemID < actualNumPixelInThisChunk)
   2575       {
   2576         float4 filteredPixel = outputPixelCache[itemID];
   2577 
   2578         float gamma = 0.0f;
   2579         if (alpha_index != 0)
   2580           gamma = gammaCache[itemID];
   2581 
   2582         float density = densityCache[itemID];
   2583         if ((density != 0.0f) && (density != 1.0f))
   2584         {
   2585           density = PerceptibleReciprocal(density);
   2586           filteredPixel *= (float4) density;
   2587           if (alpha_index != 0)
   2588             gamma *= density;
   2589         }
   2590 
   2591         if (alpha_index != 0)
   2592         {
   2593           gamma = PerceptibleReciprocal(gamma);
   2594           filteredPixel.x *= gamma;
   2595           filteredPixel.y *= gamma;
   2596           filteredPixel.z *= gamma;
   2597         }
   2598 
   2599         WriteAllChannels(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, filteredPixel);
   2600       }
   2601     }
   2602   }
   2603   )
   2604 
   2605 
   2606   STRINGIFY(
   2607  __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
   2608     void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
   2609       const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
   2610       const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor,
   2611       const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
   2612       const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
   2613       const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
   2614       const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
   2615       __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
   2616   {
   2617     // calculate the range of resized image pixels computed by this workgroup
   2618     const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
   2619     const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
   2620     const unsigned int actualNumPixelToCompute = stopY - startY;
   2621 
   2622     // calculate the range of input image pixels to cache
   2623     float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
   2624     const float support = MagickMax(scale*resizeFilterSupport,0.5f);
   2625     scale = PerceptibleReciprocal(scale);
   2626 
   2627     const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
   2628     const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
   2629 
   2630     // cache the input pixels into local memory
   2631     const unsigned int x = get_global_id(0);
   2632     unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY);
   2633     unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY;
   2634     unsigned int stride = inputColumns * number_channels;
   2635     for (unsigned int i = 0; i < number_channels; i++)
   2636     {
   2637       event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0);
   2638       wait_group_events(1,&e);
   2639     }
   2640 
   2641     unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
   2642     unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
   2643     for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
   2644     {
   2645       const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
   2646       const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
   2647       const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
   2648 
   2649       // determine which resized pixel computed by this workitem
   2650       const unsigned int itemID = get_local_id(1);
   2651       const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
   2652 
   2653       const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
   2654 
   2655       float4 filteredPixel = (float4)0.0f;
   2656       float density = 0.0f;
   2657       float gamma = 0.0f;
   2658       // -1 means this workitem doesn't participate in the computation
   2659       if (pixelIndex != -1)
   2660       {
   2661         // x coordinated of the resized pixel computed by this workitem
   2662         const int y = chunkStartY + pixelIndex;
   2663 
   2664         // calculate how many steps required for this pixel
   2665         const float bisect = (y+0.5)/yFactor+MagickEpsilon;
   2666         const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
   2667         const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows);
   2668         const unsigned int n = stop - start;
   2669 
   2670         // calculate how many steps this workitem will contribute
   2671         unsigned int numStepsPerWorkItem = n / numItems;
   2672         numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
   2673 
   2674         const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
   2675         if (startStep < n)
   2676         {
   2677           const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
   2678 
   2679           unsigned int cacheIndex = start+startStep-cacheRangeStartY;
   2680           for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
   2681           {
   2682             float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
   2683               (ResizeWeightingFunctionType) resizeFilterType,
   2684               (ResizeWeightingFunctionType) resizeWindowType,
   2685               resizeFilterScale, resizeFilterWindowSupport,
   2686               resizeFilterBlur, scale*(start + i - bisect + 0.5));
   2687 
   2688             float4 cp = (float4)0.0f;
   2689 
   2690             __local CLQuantum *p = inputImageCache + cacheIndex;
   2691             cp.x = (float) *(p);
   2692             if (number_channels > 2)
   2693             {
   2694               cp.y = (float) *(p + rangeLength);
   2695               cp.z = (float) *(p + (rangeLength * 2));
   2696             }
   2697 
   2698             if (alpha_index != 0)
   2699             {
   2700               cp.w = (float) *(p + (rangeLength * alpha_index));
   2701 
   2702               float alpha = weight * QuantumScale * cp.w;
   2703 
   2704               filteredPixel.x += alpha * cp.x;
   2705               filteredPixel.y += alpha * cp.y;
   2706               filteredPixel.z += alpha * cp.z;
   2707               filteredPixel.w += weight * cp.w;
   2708               gamma += alpha;
   2709             }
   2710             else
   2711               filteredPixel += ((float4) weight)*cp;
   2712 
   2713             density += weight;
   2714           }
   2715         }
   2716       }
   2717 
   2718       // initialize the accumulators to zero
   2719       if (itemID < actualNumPixelInThisChunk) {
   2720         outputPixelCache[itemID] = (float4)0.0f;
   2721         densityCache[itemID] = 0.0f;
   2722         if (alpha_index != 0)
   2723           gammaCache[itemID] = 0.0f;
   2724       }
   2725       barrier(CLK_LOCAL_MEM_FENCE);
   2726 
   2727       // accumulatte the filtered pixel value and the density
   2728       for (unsigned int i = 0; i < numItems; i++) {
   2729         if (pixelIndex != -1) {
   2730           if (itemID%numItems == i) {
   2731             outputPixelCache[pixelIndex]+=filteredPixel;
   2732             densityCache[pixelIndex]+=density;
   2733             if (alpha_index != 0)
   2734               gammaCache[pixelIndex]+=gamma;
   2735           }
   2736         }
   2737         barrier(CLK_LOCAL_MEM_FENCE);
   2738       }
   2739 
   2740       if (itemID < actualNumPixelInThisChunk)
   2741       {
   2742         float4 filteredPixel = outputPixelCache[itemID];
   2743 
   2744         float gamma = 0.0f;
   2745         if (alpha_index != 0)
   2746           gamma = gammaCache[itemID];
   2747 
   2748         float density = densityCache[itemID];
   2749         if ((density != 0.0f) && (density != 1.0f))
   2750         {
   2751           density = PerceptibleReciprocal(density);
   2752           filteredPixel *= (float4) density;
   2753           if (alpha_index != 0)
   2754             gamma *= density;
   2755         }
   2756 
   2757         if (alpha_index != 0)
   2758         {
   2759           gamma = PerceptibleReciprocal(gamma);
   2760           filteredPixel.x *= gamma;
   2761           filteredPixel.y *= gamma;
   2762           filteredPixel.z *= gamma;
   2763         }
   2764 
   2765         WriteAllChannels(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, filteredPixel);
   2766       }
   2767     }
   2768   }
   2769   )
   2770 
   2771 /*
   2772 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   2773 %                                                                             %
   2774 %                                                                             %
   2775 %                                                                             %
   2776 %     R o t a t i o n a l B l u r                                             %
   2777 %                                                                             %
   2778 %                                                                             %
   2779 %                                                                             %
   2780 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   2781 */
   2782 
   2783   STRINGIFY(
   2784   __kernel void RotationalBlur(const __global CLQuantum *image,
   2785     const unsigned int number_channels,const unsigned int channel,
   2786     const float2 blurCenter,__constant float *cos_theta,
   2787     __constant float *sin_theta,const unsigned int cossin_theta_size,
   2788     __global CLQuantum *filteredImage)
   2789   {
   2790     const int x = get_global_id(0);
   2791     const int y = get_global_id(1);
   2792     const int columns = get_global_size(0);
   2793     const int rows = get_global_size(1);
   2794     unsigned int step = 1;
   2795     float center_x = (float) x - blurCenter.x;
   2796     float center_y = (float) y - blurCenter.y;
   2797     float radius = hypot(center_x, center_y);
   2798 
   2799     float blur_radius = hypot(blurCenter.x, blurCenter.y);
   2800 
   2801     if (radius > MagickEpsilon)
   2802     {
   2803       step = (unsigned int) (blur_radius / radius);
   2804       if (step == 0)
   2805         step = 1;
   2806       if (step >= cossin_theta_size)
   2807         step = cossin_theta_size-1;
   2808     }
   2809 
   2810     float4 result = 0.0f;
   2811     float normalize = 0.0f;
   2812     float gamma = 0.0f;
   2813 
   2814     for (unsigned int i=0; i<cossin_theta_size; i+=step)
   2815     {
   2816       int cx = ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns);
   2817       int cy = ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f,rows);
   2818 
   2819       float4 pixel = ReadAllChannels(image, number_channels, columns, cx, cy);
   2820 
   2821       if ((number_channels == 4) || (number_channels == 2))
   2822       {
   2823         float alpha = (float)(QuantumScale*pixel.w);
   2824 
   2825         gamma += alpha;
   2826 
   2827         result.x += alpha * pixel.x;
   2828         result.y += alpha * pixel.y;
   2829         result.z += alpha * pixel.z;
   2830         result.w += pixel.w;
   2831       }
   2832       else
   2833         result += pixel;
   2834 
   2835       normalize += 1.0f;
   2836     }
   2837 
   2838     normalize = PerceptibleReciprocal(normalize);
   2839 
   2840     if ((number_channels == 4) || (number_channels == 2))
   2841     {
   2842       gamma = PerceptibleReciprocal(gamma);
   2843       result.x *= gamma;
   2844       result.y *= gamma;
   2845       result.z *= gamma;
   2846       result.w *= normalize;
   2847     }
   2848     else
   2849       result *= normalize;
   2850 
   2851     WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
   2852   }
   2853   )
   2854 
   2855 /*
   2856 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   2857 %                                                                             %
   2858 %                                                                             %
   2859 %                                                                             %
   2860 %     U n s h a r p M a s k                                                   %
   2861 %                                                                             %
   2862 %                                                                             %
   2863 %                                                                             %
   2864 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
   2865 */
   2866 
   2867   STRINGIFY(
   2868   __kernel void UnsharpMaskBlurColumn(const __global CLQuantum* image,
   2869     const __global float4 *blurRowData,const unsigned int number_channels,
   2870     const ChannelType channel,const unsigned int columns,
   2871     const unsigned int rows,__local float4* cachedData,
   2872     __local float* cachedFilter,const __global float *filter,
   2873     const unsigned int width,const float gain, const float threshold,
   2874     __global CLQuantum *filteredImage)
   2875   {
   2876     const unsigned int radius = (width-1)/2;
   2877 
   2878     // cache the pixel shared by the workgroup
   2879     const int groupX = get_group_id(0);
   2880     const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
   2881     const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
   2882 
   2883     if ((groupStartY >= 0) && (groupStopY < rows))
   2884     {
   2885       event_t e = async_work_group_strided_copy(cachedData,
   2886         blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0);
   2887       wait_group_events(1,&e);
   2888     }
   2889     else
   2890     {
   2891       for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1))
   2892         cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX];
   2893 
   2894       barrier(CLK_LOCAL_MEM_FENCE);
   2895     }
   2896     // cache the filter as well
   2897     event_t e = async_work_group_copy(cachedFilter,filter,width,0);
   2898     wait_group_events(1,&e);
   2899 
   2900     // only do the work if this is not a patched item
   2901     const int cy = get_global_id(1);
   2902 
   2903     if (cy < rows)
   2904     {
   2905       float4 blurredPixel = (float4) 0.0f;
   2906 
   2907       int i = 0;
   2908 
   2909       for ( ; i+7 < width; )
   2910       {
   2911         for (int j=0; j < 8; j++, i++)
   2912           blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)];
   2913       }
   2914 
   2915       for ( ; i < width; i++)
   2916         blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
   2917 
   2918       float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel);
   2919       float4 outputPixel = inputImagePixel - blurredPixel;
   2920 
   2921       float quantumThreshold = QuantumRange*threshold;
   2922 
   2923       int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
   2924       outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
   2925 
   2926       //write back
   2927       WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel);
   2928     }
   2929   }
   2930   )
   2931 
   2932   STRINGIFY(
   2933   __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels,
   2934     const ChannelType channel,__constant float *filter,const unsigned int width,
   2935     const unsigned int columns,const unsigned int rows,__local float4 *pixels,
   2936     const float gain,const float threshold,__global CLQuantum *filteredImage)
   2937   {
   2938     const unsigned int x = get_global_id(0);
   2939     const unsigned int y = get_global_id(1);
   2940 
   2941     const unsigned int radius = (width - 1) / 2;
   2942 
   2943     int row = y - radius;
   2944     int baseRow = get_group_id(1) * get_local_size(1) - radius;
   2945     int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
   2946 
   2947     while (row < endRow) {
   2948       int srcy = (row < 0) ? -row : row; // mirror pad
   2949       srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy;
   2950 
   2951       float4 value = 0.0f;
   2952 
   2953       int ix = x - radius;
   2954       int i = 0;
   2955 
   2956       while (i + 7 < width) {
   2957         for (int j = 0; j < 8; ++j) { // unrolled
   2958           int srcx = ix + j;
   2959           srcx = (srcx < 0) ? -srcx : srcx;
   2960           srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
   2961           value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
   2962         }
   2963         ix += 8;
   2964         i += 8;
   2965       }
   2966 
   2967       while (i < width) {
   2968         int srcx = (ix < 0) ? -ix : ix; // mirror pad
   2969         srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
   2970         value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
   2971         ++i;
   2972         ++ix;
   2973       }
   2974       pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
   2975       row += get_local_size(1);
   2976     }
   2977 
   2978     barrier(CLK_LOCAL_MEM_FENCE);
   2979 
   2980     const int px = get_local_id(0);
   2981     const int py = get_local_id(1);
   2982     const int prp = get_local_size(0);
   2983     float4 value = (float4)(0.0f);
   2984 
   2985     int i = 0;
   2986     while (i + 7 < width) {
   2987       for (int j = 0; j < 8; ++j) // unrolled
   2988         value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp];
   2989       i += 8;
   2990     }
   2991     while (i < width) {
   2992       value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
   2993       ++i;
   2994     }
   2995 
   2996     float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel);
   2997     float4 diff = srcPixel - value;
   2998 
   2999     float quantumThreshold = QuantumRange*threshold;
   3000 
   3001     int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
   3002     value = select(srcPixel + diff * gain, srcPixel, mask);
   3003 
   3004     if ((x < columns) && (y < rows))
   3005       WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value);
   3006   }
   3007   )
   3008 
   3009   STRINGIFY(
   3010     __kernel __attribute__((reqd_work_group_size(64, 4, 1)))
   3011     void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage,
   3012       const unsigned int number_channels,const unsigned int max_channels,
   3013       const float threshold,const int passes,const unsigned int imageWidth,
   3014       const unsigned int imageHeight)
   3015   {
   3016     const int pad = (1 << (passes - 1));
   3017     const int tileSize = 64;
   3018     const int tileRowPixels = 64;
   3019     const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
   3020 
   3021     CLQuantum stage[48]; // 16 * 3 (we only need 3 channels)
   3022 
   3023     local float buffer[64 * 64];
   3024 
   3025     int srcx = get_group_id(0) * (tileSize - 2 * pad) - pad + get_local_id(0);
   3026     int srcy = get_group_id(1) * (tileSize - 2 * pad) - pad;
   3027 
   3028     for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
   3029       int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) +
   3030                 (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels;
   3031 
   3032       for (int channel = 0; channel < max_channels; ++channel)
   3033         stage[(i / 4) + (16 * channel)] = srcImage[pos + channel];
   3034     }
   3035 
   3036     for (int channel = 0; channel < max_channels; ++channel) {
   3037       // Load LDS
   3038       for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
   3039         buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]);
   3040 
   3041       // Process
   3042 
   3043       float tmp[16];
   3044       float accum[16];
   3045       float pixel;
   3046 
   3047       for (int i = 0; i < 16; i++)
   3048         accum[i]=0.0f;
   3049 
   3050       for (int pass = 0; pass < passes; ++pass) {
   3051         const int radius = 1 << pass;
   3052         const int x = get_local_id(0);
   3053         const float thresh = threshold * noise[pass];
   3054 
   3055         // Apply horizontal hat
   3056         for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
   3057           const int offset = i * tileRowPixels;
   3058           if (pass == 0)
   3059             tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass
   3060           pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
   3061           barrier(CLK_LOCAL_MEM_FENCE);
   3062           buffer[x + offset] = pixel;
   3063         }
   3064         barrier(CLK_LOCAL_MEM_FENCE);
   3065 
   3066         // Apply vertical hat
   3067         for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
   3068           pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
   3069           float delta = tmp[i / 4] - pixel;
   3070           tmp[i / 4] = pixel; // hold output in tmp until all workitems are done
   3071           if (delta < -thresh)
   3072             delta += thresh;
   3073           else if (delta > thresh)
   3074             delta -= thresh;
   3075           else
   3076             delta = 0;
   3077           accum[i / 4] += delta;
   3078         }
   3079         barrier(CLK_LOCAL_MEM_FENCE);
   3080 
   3081         if (pass < passes - 1)
   3082           for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
   3083             buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
   3084         else  // last pass
   3085           for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
   3086             accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
   3087         barrier(CLK_LOCAL_MEM_FENCE);
   3088       }
   3089 
   3090       for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
   3091         stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]);
   3092 
   3093       barrier(CLK_LOCAL_MEM_FENCE);
   3094     }
   3095 
   3096     // Write from stage to output
   3097 
   3098     if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
   3099       for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
   3100         if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
   3101           int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels));
   3102           for (int channel = 0; channel < max_channels; ++channel) {
   3103             dstImage[pos + channel] = stage[(i / 4) + (16 * channel)];
   3104           }
   3105         }
   3106       }
   3107     }
   3108   }
   3109   )
   3110 
   3111   ;
   3112 
   3113 #endif // MAGICKCORE_OPENCL_SUPPORT
   3114 
   3115 #if defined(__cplusplus) || defined(c_plusplus)
   3116 }
   3117 #endif
   3118 
   3119 #endif // MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
   3120