MagickCore  7.1.1-43
Convert, Edit, Or Compose Bitmap Images
accelerate-kernels-private.h
1 /*
2  Copyright @ 1999 ImageMagick Studio LLC, a non-profit organization
3  dedicated to making software imaging solutions freely available.
4 
5  You may not use this file except in compliance with the License. You may
6  obtain a copy of the License at
7 
8  https://imagemagick.org/script/license.php
9 
10  Unless required by applicable law or agreed to in writing, software
11  distributed under the License is distributed on an "AS IS" BASIS,
12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  See the License for the specific language governing permissions and
14  limitations under the License.
15 
16  MagickCore private kernels for accelerated functions.
17 */
18 
19 #ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
20 #define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
21 
22 #if defined(__cplusplus) || defined(c_plusplus)
23 extern "C" {
24 #endif
25 
26 #if defined(MAGICKCORE_OPENCL_SUPPORT)
27 
28 /*
29  Define declarations.
30 */
31 #define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n"
32 #define OPENCL_ELIF(...) "\n #""elif " #__VA_ARGS__ " \n"
33 #define OPENCL_ELSE() "\n #""else " " \n"
34 #define OPENCL_ENDIF() "\n #""endif " " \n"
35 #define OPENCL_IF(...) "\n #""if " #__VA_ARGS__ " \n"
36 #define STRINGIFY(...) #__VA_ARGS__ "\n"
37 
38 const char *accelerateKernels =
39 
40 /*
41  Define declarations.
42 */
43  OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
44  OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
45  OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
46  OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
47  OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
48  OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
49  OPENCL_DEFINE(SigmaRandom, (attenuate))
50  OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
51  OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y)))
52  OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y)))
53  OPENCL_DEFINE(QuantumScale, (1.0/QuantumRange))
54 
55 /*
56  Typedef declarations.
57 */
58  STRINGIFY(
59  typedef enum
60  {
61  UndefinedColorspace,
62  CMYColorspace, /* negated linear RGB colorspace */
63  CMYKColorspace, /* CMY with Black separation */
64  GRAYColorspace, /* Single Channel greyscale (non-linear) image */
65  HCLColorspace,
66  HCLpColorspace,
67  HSBColorspace,
68  HSIColorspace,
69  HSLColorspace,
70  HSVColorspace, /* alias for HSB */
71  HWBColorspace,
72  LabColorspace,
73  LCHColorspace, /* alias for LCHuv */
74  LCHabColorspace, /* Cylindrical (Polar) Lab */
75  LCHuvColorspace, /* Cylindrical (Polar) Luv */
76  LogColorspace,
77  LMSColorspace,
78  LuvColorspace,
79  OHTAColorspace,
80  Rec601YCbCrColorspace,
81  Rec709YCbCrColorspace,
82  RGBColorspace, /* Linear RGB colorspace */
83  scRGBColorspace, /* ??? */
84  sRGBColorspace, /* Default: non-linear sRGB colorspace */
85  TransparentColorspace,
86  xyYColorspace,
87  XYZColorspace, /* IEEE Color Reference colorspace */
88  YCbCrColorspace,
89  YCCColorspace,
90  YDbDrColorspace,
91  YIQColorspace,
92  YPbPrColorspace,
93  YUVColorspace,
94  LinearGRAYColorspace /* Single Channel greyscale (linear) image */
95  } ColorspaceType;
96  )
97 
98  STRINGIFY(
99  typedef enum
100  {
101  UndefinedCompositeOp,
102  AlphaCompositeOp,
103  AtopCompositeOp,
104  BlendCompositeOp,
105  BlurCompositeOp,
106  BumpmapCompositeOp,
107  ChangeMaskCompositeOp,
108  ClearCompositeOp,
109  ColorBurnCompositeOp,
110  ColorDodgeCompositeOp,
111  ColorizeCompositeOp,
112  CopyBlackCompositeOp,
113  CopyBlueCompositeOp,
114  CopyCompositeOp,
115  CopyCyanCompositeOp,
116  CopyGreenCompositeOp,
117  CopyMagentaCompositeOp,
118  CopyAlphaCompositeOp,
119  CopyRedCompositeOp,
120  CopyYellowCompositeOp,
121  DarkenCompositeOp,
122  DarkenIntensityCompositeOp,
123  DifferenceCompositeOp,
124  DisplaceCompositeOp,
125  DissolveCompositeOp,
126  DistortCompositeOp,
127  DivideDstCompositeOp,
128  DivideSrcCompositeOp,
129  DstAtopCompositeOp,
130  DstCompositeOp,
131  DstInCompositeOp,
132  DstOutCompositeOp,
133  DstOverCompositeOp,
134  ExclusionCompositeOp,
135  HardLightCompositeOp,
136  HardMixCompositeOp,
137  HueCompositeOp,
138  InCompositeOp,
139  IntensityCompositeOp,
140  LightenCompositeOp,
141  LightenIntensityCompositeOp,
142  LinearBurnCompositeOp,
143  LinearDodgeCompositeOp,
144  LinearLightCompositeOp,
145  LuminizeCompositeOp,
146  MathematicsCompositeOp,
147  MinusDstCompositeOp,
148  MinusSrcCompositeOp,
149  ModulateCompositeOp,
150  ModulusAddCompositeOp,
151  ModulusSubtractCompositeOp,
152  MultiplyCompositeOp,
153  NoCompositeOp,
154  OutCompositeOp,
155  OverCompositeOp,
156  OverlayCompositeOp,
157  PegtopLightCompositeOp,
158  PinLightCompositeOp,
159  PlusCompositeOp,
160  ReplaceCompositeOp,
161  SaturateCompositeOp,
162  ScreenCompositeOp,
163  SoftLightCompositeOp,
164  SrcAtopCompositeOp,
165  SrcCompositeOp,
166  SrcInCompositeOp,
167  SrcOutCompositeOp,
168  SrcOverCompositeOp,
169  ThresholdCompositeOp,
170  VividLightCompositeOp,
171  XorCompositeOp,
172  StereoCompositeOp
173  } CompositeOperator;
174  )
175 
176  STRINGIFY(
177  typedef enum
178  {
179  UndefinedFunction,
180  ArcsinFunction,
181  ArctanFunction,
182  PolynomialFunction,
183  SinusoidFunction
184  } MagickFunction;
185  )
186 
187  STRINGIFY(
188  typedef enum
189  {
190  UndefinedNoise,
191  UniformNoise,
192  GaussianNoise,
193  MultiplicativeGaussianNoise,
194  ImpulseNoise,
195  LaplacianNoise,
196  PoissonNoise,
197  RandomNoise
198  } NoiseType;
199  )
200 
201  STRINGIFY(
202  typedef enum
203  {
204  UndefinedPixelIntensityMethod = 0,
205  AveragePixelIntensityMethod,
206  BrightnessPixelIntensityMethod,
207  LightnessPixelIntensityMethod,
208  MSPixelIntensityMethod,
209  Rec601LumaPixelIntensityMethod,
210  Rec601LuminancePixelIntensityMethod,
211  Rec709LumaPixelIntensityMethod,
212  Rec709LuminancePixelIntensityMethod,
213  RMSPixelIntensityMethod
214  } PixelIntensityMethod;
215  )
216 
217  STRINGIFY(
218  typedef enum
219  {
220  BoxWeightingFunction = 0,
221  TriangleWeightingFunction,
222  CubicBCWeightingFunction,
223  HannWeightingFunction,
224  HammingWeightingFunction,
225  BlackmanWeightingFunction,
226  GaussianWeightingFunction,
227  QuadraticWeightingFunction,
228  JincWeightingFunction,
229  SincWeightingFunction,
230  SincFastWeightingFunction,
231  KaiserWeightingFunction,
232  WelchWeightingFunction,
233  BohmanWeightingFunction,
234  LagrangeWeightingFunction,
235  CosineWeightingFunction
236  } ResizeWeightingFunctionType;
237  )
238 
239  STRINGIFY(
240  typedef enum
241  {
242  UndefinedChannel = 0x0000,
243  RedChannel = 0x0001,
244  GrayChannel = 0x0001,
245  CyanChannel = 0x0001,
246  GreenChannel = 0x0002,
247  MagentaChannel = 0x0002,
248  BlueChannel = 0x0004,
249  YellowChannel = 0x0004,
250  BlackChannel = 0x0008,
251  AlphaChannel = 0x0010,
252  OpacityChannel = 0x0010,
253  IndexChannel = 0x0020, /* Color Index Table? */
254  ReadMaskChannel = 0x0040, /* Pixel is Not Readable? */
255  WriteMaskChannel = 0x0080, /* Pixel is Write Protected? */
256  MetaChannel = 0x0100, /* ???? */
257  CompositeChannels = 0x001F,
258  AllChannels = 0x7ffffff, /* 0x7FFFFFFFFFFFFFFF for 64-bit channel masks */
259  /*
260  Special purpose channel types.
261  FUTURE: are these needed any more - they are more like hacks
262  SyncChannels for example is NOT a real channel but a 'flag'
263  It really says -- "User has not defined channels"
264  Though it does have extra meaning in the "-auto-level" operator
265  */
266  TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */
267  RGBChannels = 0x0200, /* set alpha from grayscale mask in RGB */
268  GrayChannels = 0x0400,
269  SyncChannels = 0x20000, /* channels modified as a single unit */
270  DefaultChannels = AllChannels
271  } ChannelType; /* must correspond to PixelChannel */
272  )
273 
274 /*
275  Helper functions.
276 */
277 
278 OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
279 
280  STRINGIFY(
281  static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
282  {
283  return((CLQuantum) value);
284  }
285  )
286 
287 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
288 
289  STRINGIFY(
290  static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
291  {
292  return((CLQuantum) (257.0f*value));
293  }
294  )
295 
296 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
297 
298  STRINGIFY(
299  static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
300  {
301  return((CLQuantum) (16843009.0*value));
302  }
303  )
304 
305 OPENCL_ENDIF()
306 
307 OPENCL_IF((MAGICKCORE_HDRI_SUPPORT == 1))
308 
309  STRINGIFY(
310  static inline CLQuantum ClampToQuantum(const float value)
311  {
312  return (CLQuantum) clamp(value, 0.0f, QuantumRange);
313  }
314  )
315 
316 OPENCL_ELSE()
317 
318  STRINGIFY(
319  static inline CLQuantum ClampToQuantum(const float value)
320  {
321  return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f);
322  }
323  )
324 
325 OPENCL_ENDIF()
326 
327  STRINGIFY(
328  static inline int ClampToCanvas(const int offset,const int range)
329  {
330  return clamp(offset, (int)0, range-1);
331  }
332  )
333 
334  STRINGIFY(
335  static inline uint ScaleQuantumToMap(CLQuantum value)
336  {
337  if (value >= (CLQuantum) MaxMap)
338  return ((uint)MaxMap);
339  else
340  return ((uint)value);
341  }
342  )
343 
344  STRINGIFY(
345  static inline float PerceptibleReciprocal(const float x)
346  {
347  float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0;
348  return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon));
349  }
350  )
351 
352  STRINGIFY(
353 
354  static inline unsigned int getPixelIndex(const unsigned int number_channels,
355  const unsigned int columns, const unsigned int x, const unsigned int y)
356  {
357  return (x * number_channels) + (y * columns * number_channels);
358  }
359 
360  static inline float getPixelRed(const __global CLQuantum *p) { return (float)*p; }
361  static inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); }
362  static inline float getPixelBlue(const __global CLQuantum *p) { return (float)*(p+2); }
363  static inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); }
364 
365  static inline void setPixelRed(__global CLQuantum *p,const CLQuantum value) { *p=value; }
366  static inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; }
367  static inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value) { *(p+2)=value; }
368  static inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; }
369 
370  static inline CLQuantum getBlue(CLPixelType p) { return p.x; }
371  static inline void setBlue(CLPixelType* p, CLQuantum value) { (*p).x = value; }
372  static inline float getBlueF4(float4 p) { return p.x; }
373  static inline void setBlueF4(float4* p, float value) { (*p).x = value; }
374 
375  static inline CLQuantum getGreen(CLPixelType p) { return p.y; }
376  static inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
377  static inline float getGreenF4(float4 p) { return p.y; }
378  static inline void setGreenF4(float4* p, float value) { (*p).y = value; }
379 
380  static inline CLQuantum getRed(CLPixelType p) { return p.z; }
381  static inline void setRed(CLPixelType* p, CLQuantum value) { (*p).z = value; }
382  static inline float getRedF4(float4 p) { return p.z; }
383  static inline void setRedF4(float4* p, float value) { (*p).z = value; }
384 
385  static inline CLQuantum getAlpha(CLPixelType p) { return p.w; }
386  static inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; }
387  static inline float getAlphaF4(float4 p) { return p.w; }
388  static inline void setAlphaF4(float4* p, float value) { (*p).w = value; }
389 
390  static inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels,
391  const ChannelType channel, float *red, float *green, float *blue, float *alpha)
392  {
393  if ((channel & RedChannel) != 0)
394  *red=getPixelRed(p);
395 
396  if (number_channels > 2)
397  {
398  if ((channel & GreenChannel) != 0)
399  *green=getPixelGreen(p);
400 
401  if ((channel & BlueChannel) != 0)
402  *blue=getPixelBlue(p);
403  }
404 
405  if (((number_channels == 4) || (number_channels == 2)) &&
406  ((channel & AlphaChannel) != 0))
407  *alpha=getPixelAlpha(p,number_channels);
408  }
409 
410  static inline float4 ReadAllChannels(const __global CLQuantum *image, const unsigned int number_channels,
411  const unsigned int columns, const unsigned int x, const unsigned int y)
412  {
413  const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
414 
415  float4 pixel;
416 
417  pixel.x=getPixelRed(p);
418 
419  if (number_channels > 2)
420  {
421  pixel.y=getPixelGreen(p);
422  pixel.z=getPixelBlue(p);
423  }
424 
425  if ((number_channels == 4) || (number_channels == 2))
426  pixel.w=getPixelAlpha(p,number_channels);
427  return(pixel);
428  }
429 
430  static inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels,
431  const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel)
432  {
433  const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
434 
435  float red = 0.0f;
436  float green = 0.0f;
437  float blue = 0.0f;
438  float alpha = 0.0f;
439 
440  ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
441  return (float4)(red, green, blue, alpha);
442  }
443 
444  static inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels,
445  const ChannelType channel, float red, float green, float blue, float alpha)
446  {
447  if ((channel & RedChannel) != 0)
448  setPixelRed(p,ClampToQuantum(red));
449 
450  if (number_channels > 2)
451  {
452  if ((channel & GreenChannel) != 0)
453  setPixelGreen(p,ClampToQuantum(green));
454 
455  if ((channel & BlueChannel) != 0)
456  setPixelBlue(p,ClampToQuantum(blue));
457  }
458 
459  if (((number_channels == 4) || (number_channels == 2)) &&
460  ((channel & AlphaChannel) != 0))
461  setPixelAlpha(p,number_channels,ClampToQuantum(alpha));
462  }
463 
464  static inline void WriteAllChannels(__global CLQuantum *image, const unsigned int number_channels,
465  const unsigned int columns, const unsigned int x, const unsigned int y, float4 pixel)
466  {
467  __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
468 
469  setPixelRed(p,ClampToQuantum(pixel.x));
470 
471  if (number_channels > 2)
472  {
473  setPixelGreen(p,ClampToQuantum(pixel.y));
474  setPixelBlue(p,ClampToQuantum(pixel.z));
475  }
476 
477  if ((number_channels == 4) || (number_channels == 2))
478  setPixelAlpha(p,number_channels,ClampToQuantum(pixel.w));
479  }
480 
481  static inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels,
482  const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel,
483  float4 pixel)
484  {
485  __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
486  WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w);
487  }
488 
489  static inline float GetPixelIntensity(const unsigned int colorspace,
490  const unsigned int method,float red,float green,float blue)
491  {
492  float intensity;
493 
494  if ((colorspace == GRAYColorspace) || (colorspace == LinearGRAYColorspace))
495  return red;
496 
497  switch (method)
498  {
499  case AveragePixelIntensityMethod:
500  {
501  intensity=(red+green+blue)/3.0;
502  break;
503  }
504  case BrightnessPixelIntensityMethod:
505  {
506  intensity=MagickMax(MagickMax(red,green),blue);
507  break;
508  }
509  case LightnessPixelIntensityMethod:
510  {
511  intensity=(MagickMin(MagickMin(red,green),blue)+
512  MagickMax(MagickMax(red,green),blue))/2.0;
513  break;
514  }
515  case MSPixelIntensityMethod:
516  {
517  intensity=(float) (((float) red*red+green*green+blue*blue)/
518  (3.0*QuantumRange));
519  break;
520  }
521  case Rec601LumaPixelIntensityMethod:
522  {
523  /*
524  if (image->colorspace == RGBColorspace)
525  {
526  red=EncodePixelGamma(red);
527  green=EncodePixelGamma(green);
528  blue=EncodePixelGamma(blue);
529  }
530  */
531  intensity=0.298839*red+0.586811*green+0.114350*blue;
532  break;
533  }
534  case Rec601LuminancePixelIntensityMethod:
535  {
536  /*
537  if (image->colorspace == sRGBColorspace)
538  {
539  red=DecodePixelGamma(red);
540  green=DecodePixelGamma(green);
541  blue=DecodePixelGamma(blue);
542  }
543  */
544  intensity=0.298839*red+0.586811*green+0.114350*blue;
545  break;
546  }
547  case Rec709LumaPixelIntensityMethod:
548  default:
549  {
550  /*
551  if (image->colorspace == RGBColorspace)
552  {
553  red=EncodePixelGamma(red);
554  green=EncodePixelGamma(green);
555  blue=EncodePixelGamma(blue);
556  }
557  */
558  intensity=0.212656*red+0.715158*green+0.072186*blue;
559  break;
560  }
561  case Rec709LuminancePixelIntensityMethod:
562  {
563  /*
564  if (image->colorspace == sRGBColorspace)
565  {
566  red=DecodePixelGamma(red);
567  green=DecodePixelGamma(green);
568  blue=DecodePixelGamma(blue);
569  }
570  */
571  intensity=0.212656*red+0.715158*green+0.072186*blue;
572  break;
573  }
574  case RMSPixelIntensityMethod:
575  {
576  intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/
577  sqrt(3.0));
578  break;
579  }
580  }
581 
582  return intensity;
583  }
584 
585  static inline int mirrorBottom(int value)
586  {
587  return (value < 0) ? - (value) : value;
588  }
589 
590  static inline int mirrorTop(int value, int width)
591  {
592  return (value >= width) ? (2 * width - value - 1) : value;
593  }
594  )
595 
596 /*
597 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
598 % %
599 % %
600 % %
601 % A d d N o i s e %
602 % %
603 % %
604 % %
605 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
606 */
607 
608  STRINGIFY(
609  /*
610  Part of MWC64X by David Thomas, dt10@imperial.ac.uk
611  This is provided under BSD, full license is with the main package.
612  See http://www.doc.ic.ac.uk/~dt10/research
613  */
614 
615  // Pre: a<M, b<M
616  // Post: r=(a+b) mod M
617  ulong MWC_AddMod64(ulong a, ulong b, ulong M)
618  {
619  ulong v=a+b;
620  //if( (v>=M) || (v<a) )
621  if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug.
622  v=v-M;
623  return v;
624  }
625 
626  // Pre: a<M,b<M
627  // Post: r=(a*b) mod M
628  // This could be done more efficiently, but it is portable, and should
629  // be easy to understand. It can be replaced with any of the better
630  // modular multiplication algorithms (for example if you know you have
631  // double precision available or something).
632  ulong MWC_MulMod64(ulong a, ulong b, ulong M)
633  {
634  ulong r=0;
635  while(a!=0){
636  if(a&1)
637  r=MWC_AddMod64(r,b,M);
638  b=MWC_AddMod64(b,b,M);
639  a=a>>1;
640  }
641  return r;
642  }
643 
644  // Pre: a<M, e>=0
645  // Post: r=(a^b) mod M
646  // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
647  // most architectures
648  ulong MWC_PowMod64(ulong a, ulong e, ulong M)
649  {
650  ulong sqr=a, acc=1;
651  while(e!=0){
652  if(e&1)
653  acc=MWC_MulMod64(acc,sqr,M);
654  sqr=MWC_MulMod64(sqr,sqr,M);
655  e=e>>1;
656  }
657  return acc;
658  }
659 
660  uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
661  {
662  ulong m=MWC_PowMod64(A, distance, M);
663  ulong x=curr.x*(ulong)A+curr.y;
664  x=MWC_MulMod64(x, m, M);
665  return (uint2)((uint)(x/A), (uint)(x%A));
666  }
667 
668  uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
669  {
670  // This is an arbitrary constant for starting LCG jumping from. I didn't
671  // want to start from 1, as then you end up with the two or three first values
672  // being a bit poor in ones - once you've decided that, one constant is as
673  // good as any another. There is no deep mathematical reason for it, I just
674  // generated a random number.
675  enum{ MWC_BASEID = 4077358422479273989UL };
676 
677  ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
678  ulong m=MWC_PowMod64(A, dist, M);
679 
680  ulong x=MWC_MulMod64(MWC_BASEID, m, M);
681  return (uint2)((uint)(x/A), (uint)(x%A));
682  }
683 
685  typedef struct{ uint x; uint c; uint seed0; ulong seed1; } mwc64x_state_t;
686 
687  void MWC64X_Step(mwc64x_state_t *s)
688  {
689  uint X=s->x, C=s->c;
690 
691  uint Xn=s->seed0*X+C;
692  uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar
693  uint Cn=mad_hi(s->seed0,X,carry);
694 
695  s->x=Xn;
696  s->c=Cn;
697  }
698 
699  void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
700  {
701  uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), s->seed0, s->seed1, distance);
702  s->x=tmp.x;
703  s->c=tmp.y;
704  }
705 
706  void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
707  {
708  uint2 tmp=MWC_SeedImpl_Mod64(s->seed0, s->seed1, 1, 0, baseOffset, perStreamOffset);
709  s->x=tmp.x;
710  s->c=tmp.y;
711  }
712 
714  uint MWC64X_NextUint(mwc64x_state_t *s)
715  {
716  uint res=s->x ^ s->c;
717  MWC64X_Step(s);
718  return res;
719  }
720 
721  //
722  // End of MWC64X excerpt
723  //
724 
725  float mwcReadPseudoRandomValue(mwc64x_state_t* rng)
726  {
727  return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0
728  }
729 
730  float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate)
731  {
732  float
733  alpha,
734  beta,
735  noise,
736  sigma;
737 
738  noise = 0.0f;
739  alpha=mwcReadPseudoRandomValue(r);
740  switch(noise_type)
741  {
742  case UniformNoise:
743  default:
744  {
745  noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
746  break;
747  }
748  case GaussianNoise:
749  {
750  float
751  gamma,
752  tau;
753 
754  if (alpha == 0.0f)
755  alpha=1.0f;
756  beta=mwcReadPseudoRandomValue(r);
757  gamma=sqrt(-2.0f*log(alpha));
758  sigma=gamma*cospi((2.0f*beta));
759  tau=gamma*sinpi((2.0f*beta));
760  noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau;
761  break;
762  }
763  case ImpulseNoise:
764  {
765  if (alpha < (SigmaImpulse/2.0f))
766  noise=0.0f;
767  else
768  if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
769  noise=QuantumRange;
770  else
771  noise=pixel;
772  break;
773  }
774  case LaplacianNoise:
775  {
776  if (alpha <= 0.5f)
777  {
778  if (alpha <= MagickEpsilon)
779  noise=(pixel-QuantumRange);
780  else
781  noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
782  0.5f);
783  break;
784  }
785  beta=1.0f-alpha;
786  if (beta <= (0.5f*MagickEpsilon))
787  noise=(pixel+QuantumRange);
788  else
789  noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
790  break;
791  }
792  case MultiplicativeGaussianNoise:
793  {
794  sigma=1.0f;
795  if (alpha > MagickEpsilon)
796  sigma=sqrt(-2.0f*log(alpha));
797  beta=mwcReadPseudoRandomValue(r);
798  noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma*
799  cospi((2.0f*beta))/2.0f);
800  break;
801  }
802  case PoissonNoise:
803  {
804  float
805  poisson;
806  unsigned int i;
807  poisson=exp(-SigmaPoisson*QuantumScale*pixel);
808  for (i=0; alpha > poisson; i++)
809  {
810  beta=mwcReadPseudoRandomValue(r);
811  alpha*=beta;
812  }
813  noise=(QuantumRange*i*PerceptibleReciprocal(SigmaPoisson));
814  break;
815  }
816  case RandomNoise:
817  {
818  noise=(QuantumRange*SigmaRandom*alpha);
819  break;
820  }
821  }
822  return noise;
823  }
824  )
825 
826 /*
827 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
828 % %
829 % %
830 % %
831 % B l u r %
832 % %
833 % %
834 % %
835 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
836 */
837 
838  STRINGIFY(
839  /*
840  Reduce image noise and reduce detail levels by row
841  */
842  __kernel void BlurRow(const __global CLQuantum *image,
843  const unsigned int number_channels,const ChannelType channel,
844  __constant float *filter,const unsigned int width,
845  const unsigned int imageColumns,const unsigned int imageRows,
846  __local float4 *temp,__global float4 *tempImage)
847  {
848  const int x = get_global_id(0);
849  const int y = get_global_id(1);
850 
851  const int columns = imageColumns;
852 
853  const unsigned int radius = (width-1)/2;
854  const int wsize = get_local_size(0);
855  const unsigned int loadSize = wsize+width;
856 
857  //group coordinate
858  const int groupX=get_local_size(0)*get_group_id(0);
859 
860  //parallel load and clamp
861  for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
862  {
863  int cx = ClampToCanvas(i + groupX - radius, columns);
864  temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel);
865  }
866 
867  // barrier
868  barrier(CLK_LOCAL_MEM_FENCE);
869 
870  // only do the work if this is not a patched item
871  if (get_global_id(0) < columns)
872  {
873  // compute
874  float4 result = (float4) 0;
875 
876  int i = 0;
877 
878  for ( ; i+7 < width; )
879  {
880  for (int j=0; j < 8; j++)
881  result+=filter[i+j]*temp[i+j+get_local_id(0)];
882  i+=8;
883  }
884 
885  for ( ; i < width; i++)
886  result+=filter[i]*temp[i+get_local_id(0)];
887 
888  // write back to global
889  tempImage[y*columns+x] = result;
890  }
891  }
892  )
893 
894  STRINGIFY(
895  /*
896  Reduce image noise and reduce detail levels by line
897  */
898  __kernel void BlurColumn(const __global float4 *blurRowData,
899  const unsigned int number_channels,const ChannelType channel,
900  __constant float *filter,const unsigned int width,
901  const unsigned int imageColumns,const unsigned int imageRows,
902  __local float4 *temp,__global CLQuantum *filteredImage)
903  {
904  const int x = get_global_id(0);
905  const int y = get_global_id(1);
906 
907  const int columns = imageColumns;
908  const int rows = imageRows;
909 
910  unsigned int radius = (width-1)/2;
911  const int wsize = get_local_size(1);
912  const unsigned int loadSize = wsize+width;
913 
914  //group coordinate
915  const int groupX=get_local_size(0)*get_group_id(0);
916  const int groupY=get_local_size(1)*get_group_id(1);
917  //notice that get_local_size(0) is 1, so
918  //groupX=get_group_id(0);
919 
920  //parallel load and clamp
921  for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
922  temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
923 
924  // barrier
925  barrier(CLK_LOCAL_MEM_FENCE);
926 
927  // only do the work if this is not a patched item
928  if (get_global_id(1) < rows)
929  {
930  // compute
931  float4 result = (float4) 0;
932 
933  int i = 0;
934 
935  for ( ; i+7 < width; )
936  {
937  for (int j=0; j < 8; j++)
938  result+=filter[i+j]*temp[i+j+get_local_id(1)];
939  i+=8;
940  }
941 
942  for ( ; i < width; i++)
943  result+=filter[i]*temp[i+get_local_id(1)];
944 
945  // write back to global
946  WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
947  }
948  }
949  )
950 
951 /*
952 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
953 % %
954 % %
955 % %
956 % C o n t r a s t %
957 % %
958 % %
959 % %
960 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
961 */
962 
963  STRINGIFY(
964 
965  static inline float4 ConvertRGBToHSB(const float4 pixel)
966  {
967  float4 result=0.0f;
968  result.w=pixel.w;
969  float tmax=MagickMax(MagickMax(pixel.x,pixel.y),pixel.z);
970  if (tmax != 0.0f)
971  {
972  float tmin=MagickMin(MagickMin(pixel.x,pixel.y),pixel.z);
973  float delta=tmax-tmin;
974 
975  result.y=delta/tmax;
976  result.z=QuantumScale*tmax;
977  if (delta != 0.0f)
978  {
979  result.x =((pixel.x == tmax) ? 0.0f : ((pixel.y == tmax) ? 2.0f : 4.0f));
980  result.x+=((pixel.x == tmax) ? (pixel.y-pixel.z) : ((pixel.y == tmax) ?
981  (pixel.z-pixel.x) : (pixel.x-pixel.y)))/delta;
982  result.x/=6.0f;
983  result.x+=(result.x < 0.0f) ? 0.0f : 1.0f;
984  }
985  }
986  return(result);
987  }
988 
989  static inline float4 ConvertHSBToRGB(const float4 pixel)
990  {
991  float hue=pixel.x;
992  float saturation=pixel.y;
993  float brightness=pixel.z;
994 
995  float4 result=pixel;
996 
997  if (saturation == 0.0f)
998  {
999  result.x=result.y=result.z=ClampToQuantum(QuantumRange*brightness);
1000  }
1001  else
1002  {
1003  float h=6.0f*(hue-floor(hue));
1004  float f=h-floor(h);
1005  float p=brightness*(1.0f-saturation);
1006  float q=brightness*(1.0f-saturation*f);
1007  float t=brightness*(1.0f-(saturation*(1.0f-f)));
1008  int ih = (int)h;
1009 
1010  if (ih == 1)
1011  {
1012  result.x=ClampToQuantum(QuantumRange*q);
1013  result.y=ClampToQuantum(QuantumRange*brightness);
1014  result.z=ClampToQuantum(QuantumRange*p);
1015  }
1016  else if (ih == 2)
1017  {
1018  result.x=ClampToQuantum(QuantumRange*p);
1019  result.y=ClampToQuantum(QuantumRange*brightness);
1020  result.z=ClampToQuantum(QuantumRange*t);
1021  }
1022  else if (ih == 3)
1023  {
1024  result.x=ClampToQuantum(QuantumRange*p);
1025  result.y=ClampToQuantum(QuantumRange*q);
1026  result.z=ClampToQuantum(QuantumRange*brightness);
1027  }
1028  else if (ih == 4)
1029  {
1030  result.x=ClampToQuantum(QuantumRange*t);
1031  result.y=ClampToQuantum(QuantumRange*p);
1032  result.z=ClampToQuantum(QuantumRange*brightness);
1033  }
1034  else if (ih == 5)
1035  {
1036  result.x=ClampToQuantum(QuantumRange*brightness);
1037  result.y=ClampToQuantum(QuantumRange*p);
1038  result.z=ClampToQuantum(QuantumRange*q);
1039  }
1040  else
1041  {
1042  result.x=ClampToQuantum(QuantumRange*brightness);
1043  result.y=ClampToQuantum(QuantumRange*t);
1044  result.z=ClampToQuantum(QuantumRange*p);
1045  }
1046  }
1047  return(result);
1048  }
1049 
1050  __kernel void Contrast(__global CLQuantum *image,
1051  const unsigned int number_channels,const int sign)
1052  {
1053  const int x=get_global_id(0);
1054  const int y=get_global_id(1);
1055  const unsigned int columns=get_global_size(0);
1056 
1057  float4 pixel=ReadAllChannels(image,number_channels,columns,x,y);
1058  if (number_channels < 3)
1059  pixel.y=pixel.z=pixel.x;
1060 
1061  pixel=ConvertRGBToHSB(pixel);
1062  float brightness=pixel.z;
1063  brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
1064  brightness=clamp(brightness,0.0f,1.0f);
1065  pixel.z=brightness;
1066  pixel=ConvertHSBToRGB(pixel);
1067 
1068  WriteAllChannels(image,number_channels,columns,x,y,pixel);
1069  }
1070  )
1071 
1072 /*
1073 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1074 % %
1075 % %
1076 % %
1077 % C o n t r a s t S t r e t c h %
1078 % %
1079 % %
1080 % %
1081 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1082 */
1083 
1084  STRINGIFY(
1085  /*
1086  */
1087  __kernel void Histogram(__global CLPixelType * restrict im,
1088  const ChannelType channel,
1089  const unsigned int colorspace,
1090  const unsigned int method,
1091  __global uint4 * restrict histogram)
1092  {
1093  const int x = get_global_id(0);
1094  const int y = get_global_id(1);
1095  const int columns = get_global_size(0);
1096  const int c = x + y * columns;
1097  if ((channel & SyncChannels) != 0)
1098  {
1099  float red=(float)getRed(im[c]);
1100  float green=(float)getGreen(im[c]);
1101  float blue=(float)getBlue(im[c]);
1102 
1103  float intensity = GetPixelIntensity(colorspace, method, red, green, blue);
1104  uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
1105  atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
1106  }
1107  else
1108  {
1109  // for equalizing, we always need all channels?
1110  // otherwise something more
1111  }
1112  }
1113  )
1114 
1115  STRINGIFY(
1116  /*
1117  */
1118  __kernel void ContrastStretch(__global CLPixelType * restrict im,
1119  const ChannelType channel,
1120  __global CLPixelType * restrict stretch_map,
1121  const float4 white, const float4 black)
1122  {
1123  const int x = get_global_id(0);
1124  const int y = get_global_id(1);
1125  const int columns = get_global_size(0);
1126  const int c = x + y * columns;
1127 
1128  uint ePos;
1129  CLPixelType oValue, eValue;
1130  CLQuantum red, green, blue, alpha;
1131 
1132  //read from global
1133  oValue=im[c];
1134 
1135  if ((channel & RedChannel) != 0)
1136  {
1137  if (getRedF4(white) != getRedF4(black))
1138  {
1139  ePos = ScaleQuantumToMap(getRed(oValue));
1140  eValue = stretch_map[ePos];
1141  red = getRed(eValue);
1142  }
1143  }
1144 
1145  if ((channel & GreenChannel) != 0)
1146  {
1147  if (getGreenF4(white) != getGreenF4(black))
1148  {
1149  ePos = ScaleQuantumToMap(getGreen(oValue));
1150  eValue = stretch_map[ePos];
1151  green = getGreen(eValue);
1152  }
1153  }
1154 
1155  if ((channel & BlueChannel) != 0)
1156  {
1157  if (getBlueF4(white) != getBlueF4(black))
1158  {
1159  ePos = ScaleQuantumToMap(getBlue(oValue));
1160  eValue = stretch_map[ePos];
1161  blue = getBlue(eValue);
1162  }
1163  }
1164 
1165  if ((channel & AlphaChannel) != 0)
1166  {
1167  if (getAlphaF4(white) != getAlphaF4(black))
1168  {
1169  ePos = ScaleQuantumToMap(getAlpha(oValue));
1170  eValue = stretch_map[ePos];
1171  alpha = getAlpha(eValue);
1172  }
1173  }
1174 
1175  //write back
1176  im[c]=(CLPixelType)(blue, green, red, alpha);
1177 
1178  }
1179  )
1180 /*
1181 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1182 % %
1183 % %
1184 % %
1185 % D e s p e c k l e %
1186 % %
1187 % %
1188 % %
1189 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1190 */
1191 
1192  STRINGIFY(
1193 
1194  __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1195  , const unsigned int imageWidth, const unsigned int imageHeight
1196  , const int2 offset, const int polarity, const int matte) {
1197 
1198  int x = get_global_id(0);
1199  int y = get_global_id(1);
1200 
1201  CLPixelType v = inputImage[y*imageWidth+x];
1202 
1203  int2 neighbor;
1204  neighbor.y = y + offset.y;
1205  neighbor.x = x + offset.x;
1206 
1207  int2 clampedNeighbor;
1208  clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1209  clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1210 
1211  CLPixelType r = (clampedNeighbor.x == neighbor.x
1212  && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1213  :(CLPixelType)0;
1214 
1215  int sv[4];
1216  sv[0] = (int)v.x;
1217  sv[1] = (int)v.y;
1218  sv[2] = (int)v.z;
1219  sv[3] = (int)v.w;
1220 
1221  int sr[4];
1222  sr[0] = (int)r.x;
1223  sr[1] = (int)r.y;
1224  sr[2] = (int)r.z;
1225  sr[3] = (int)r.w;
1226 
1227  if (polarity > 0) {
1228  \n #pragma unroll 4\n
1229  for (unsigned int i = 0; i < 4; i++) {
1230  sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
1231  }
1232  }
1233  else {
1234  \n #pragma unroll 4\n
1235  for (unsigned int i = 0; i < 4; i++) {
1236  sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
1237  }
1238 
1239  }
1240 
1241  v.x = (CLQuantum)sv[0];
1242  v.y = (CLQuantum)sv[1];
1243  v.z = (CLQuantum)sv[2];
1244 
1245  if (matte!=0)
1246  v.w = (CLQuantum)sv[3];
1247 
1248  outputImage[y*imageWidth+x] = v;
1249 
1250  }
1251 
1252 
1253  )
1254 
1255  STRINGIFY(
1256 
1257  __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1258  , const unsigned int imageWidth, const unsigned int imageHeight
1259  , const int2 offset, const int polarity, const int matte) {
1260 
1261  int x = get_global_id(0);
1262  int y = get_global_id(1);
1263 
1264  CLPixelType v = inputImage[y*imageWidth+x];
1265 
1266  int2 neighbor, clampedNeighbor;
1267 
1268  neighbor.y = y + offset.y;
1269  neighbor.x = x + offset.x;
1270  clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1271  clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1272 
1273  CLPixelType r = (clampedNeighbor.x == neighbor.x
1274  && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1275  :(CLPixelType)0;
1276 
1277 
1278  neighbor.y = y - offset.y;
1279  neighbor.x = x - offset.x;
1280  clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1281  clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1282 
1283  CLPixelType s = (clampedNeighbor.x == neighbor.x
1284  && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1285  :(CLPixelType)0;
1286 
1287 
1288  int sv[4];
1289  sv[0] = (int)v.x;
1290  sv[1] = (int)v.y;
1291  sv[2] = (int)v.z;
1292  sv[3] = (int)v.w;
1293 
1294  int sr[4];
1295  sr[0] = (int)r.x;
1296  sr[1] = (int)r.y;
1297  sr[2] = (int)r.z;
1298  sr[3] = (int)r.w;
1299 
1300  int ss[4];
1301  ss[0] = (int)s.x;
1302  ss[1] = (int)s.y;
1303  ss[2] = (int)s.z;
1304  ss[3] = (int)s.w;
1305 
1306  if (polarity > 0) {
1307  \n #pragma unroll 4\n
1308  for (unsigned int i = 0; i < 4; i++) {
1309  //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] ) ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
1310  //
1311  //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1312  //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1313  sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1314  }
1315  }
1316  else {
1317  \n #pragma unroll 4\n
1318  for (unsigned int i = 0; i < 4; i++) {
1319  //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] ) ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
1320  //
1321  //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1322  sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1323  }
1324  }
1325 
1326  v.x = (CLQuantum)sv[0];
1327  v.y = (CLQuantum)sv[1];
1328  v.z = (CLQuantum)sv[2];
1329 
1330  if (matte!=0)
1331  v.w = (CLQuantum)sv[3];
1332 
1333  outputImage[y*imageWidth+x] = v;
1334 
1335  }
1336  )
1337 
1338 /*
1339 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1340 % %
1341 % %
1342 % %
1343 % E q u a l i z e %
1344 % %
1345 % %
1346 % %
1347 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1348 */
1349 
1350  STRINGIFY(
1351  /*
1352  */
1353  __kernel void Equalize(__global CLPixelType * restrict im,
1354  const ChannelType channel,
1355  __global CLPixelType * restrict equalize_map,
1356  const float4 white, const float4 black)
1357  {
1358  const int x = get_global_id(0);
1359  const int y = get_global_id(1);
1360  const int columns = get_global_size(0);
1361  const int c = x + y * columns;
1362 
1363  uint ePos;
1364  CLPixelType oValue, eValue;
1365  CLQuantum red, green, blue, alpha;
1366 
1367  //read from global
1368  oValue=im[c];
1369 
1370  if ((channel & SyncChannels) != 0)
1371  {
1372  if (getRedF4(white) != getRedF4(black))
1373  {
1374  ePos = ScaleQuantumToMap(getRed(oValue));
1375  eValue = equalize_map[ePos];
1376  red = getRed(eValue);
1377  ePos = ScaleQuantumToMap(getGreen(oValue));
1378  eValue = equalize_map[ePos];
1379  green = getRed(eValue);
1380  ePos = ScaleQuantumToMap(getBlue(oValue));
1381  eValue = equalize_map[ePos];
1382  blue = getRed(eValue);
1383  ePos = ScaleQuantumToMap(getAlpha(oValue));
1384  eValue = equalize_map[ePos];
1385  alpha = getRed(eValue);
1386 
1387  //write back
1388  im[c]=(CLPixelType)(blue, green, red, alpha);
1389  }
1390 
1391  }
1392 
1393  // for equalizing, we always need all channels?
1394  // otherwise something more
1395 
1396  }
1397  )
1398 
1399 /*
1400 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1401 % %
1402 % %
1403 % %
1404 % F u n c t i o n %
1405 % %
1406 % %
1407 % %
1408 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1409 */
1410 
1411  STRINGIFY(
1412  /*
1413  apply FunctionImageChannel(brightness-contrast)
1414  */
1415  CLQuantum ApplyFunction(float pixel,const MagickFunction function,
1416  const unsigned int number_parameters,__constant float *parameters)
1417  {
1418  float result = 0.0f;
1419 
1420  switch (function)
1421  {
1422  case PolynomialFunction:
1423  {
1424  for (unsigned int i=0; i < number_parameters; i++)
1425  result = result*QuantumScale*pixel + parameters[i];
1426  result *= QuantumRange;
1427  break;
1428  }
1429  case SinusoidFunction:
1430  {
1431  float freq,phase,ampl,bias;
1432  freq = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1433  phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
1434  ampl = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
1435  bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1436  result = QuantumRange*(ampl*sin(2.0f*MagickPI*
1437  (freq*QuantumScale*pixel + phase/360.0f)) + bias);
1438  break;
1439  }
1440  case ArcsinFunction:
1441  {
1442  float width,range,center,bias;
1443  width = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1444  center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1445  range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1446  bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1447 
1448  result = 2.0f/width*(QuantumScale*pixel - center);
1449  result = range/MagickPI*asin(result)+bias;
1450  result = ( result <= -1.0f ) ? bias - range/2.0f : result;
1451  result = ( result >= 1.0f ) ? bias + range/2.0f : result;
1452  result *= QuantumRange;
1453  break;
1454  }
1455  case ArctanFunction:
1456  {
1457  float slope,range,center,bias;
1458  slope = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1459  center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1460  range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1461  bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1462  result = MagickPI*slope*(QuantumScale*pixel-center);
1463  result = QuantumRange*(range/MagickPI*atan(result) + bias);
1464  break;
1465  }
1466  case UndefinedFunction:
1467  break;
1468  }
1469  return(ClampToQuantum(result));
1470  }
1471  )
1472 
1473  STRINGIFY(
1474  /*
1475  Improve brightness / contrast of the image
1476  channel : define which channel is improved
1477  function : the function called to enhance the brightness contrast
1478  number_parameters : numbers of parameters
1479  parameters : the parameter
1480  */
1481  __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels,
1482  const ChannelType channel,const MagickFunction function,const unsigned int number_parameters,
1483  __constant float *parameters)
1484  {
1485  const unsigned int x = get_global_id(0);
1486  const unsigned int y = get_global_id(1);
1487  const unsigned int columns = get_global_size(0);
1488  __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1489 
1490  float red;
1491  float green;
1492  float blue;
1493  float alpha;
1494 
1495  ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
1496 
1497  if ((channel & RedChannel) != 0)
1498  red=ApplyFunction(red, function, number_parameters, parameters);
1499 
1500  if (number_channels > 2)
1501  {
1502  if ((channel & GreenChannel) != 0)
1503  green=ApplyFunction(green, function, number_parameters, parameters);
1504 
1505  if ((channel & BlueChannel) != 0)
1506  blue=ApplyFunction(blue, function, number_parameters, parameters);
1507  }
1508 
1509  if (((number_channels == 4) || (number_channels == 2)) &&
1510  ((channel & AlphaChannel) != 0))
1511  alpha=ApplyFunction(alpha, function, number_parameters, parameters);
1512 
1513  WriteChannels(p, number_channels, channel, red, green, blue, alpha);
1514  }
1515  )
1516 
1517 /*
1518 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1519 % %
1520 % %
1521 % %
1522 % G r a y s c a l e %
1523 % %
1524 % %
1525 % %
1526 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1527 */
1528 
1529  STRINGIFY(
1530  __kernel void Grayscale(__global CLQuantum *image,const int number_channels,
1531  const unsigned int colorspace,const unsigned int method)
1532  {
1533  const unsigned int x = get_global_id(0);
1534  const unsigned int y = get_global_id(1);
1535  const unsigned int columns = get_global_size(0);
1536  __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1537 
1538  float
1539  blue,
1540  green,
1541  red;
1542 
1543  red=getPixelRed(p);
1544  green=getPixelGreen(p);
1545  blue=getPixelBlue(p);
1546 
1547  CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue));
1548 
1549  setPixelRed(p,intensity);
1550  setPixelGreen(p,intensity);
1551  setPixelBlue(p,intensity);
1552  }
1553  )
1554 
1555 /*
1556 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1557 % %
1558 % %
1559 % %
1560 % L o c a l C o n t r a s t %
1561 % %
1562 % %
1563 % %
1564 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1565 */
1566 
1567  STRINGIFY(
1568 
1569  __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
1570  const int radius,
1571  const int imageWidth,
1572  const int imageHeight)
1573  {
1574  const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
1575 
1576  int x = get_local_id(0);
1577  int y = get_global_id(1);
1578 
1579  if ((x >= imageWidth) || (y >= imageHeight))
1580  return;
1581 
1582  global CLPixelType *src = srcImage + y * imageWidth;
1583 
1584  for (int i = x; i < imageWidth; i += get_local_size(0)) {
1585  float sum = 0.0f;
1586  float weight = 1.0f;
1587 
1588  int j = i - radius;
1589  while ((j + 7) < i) {
1590  for (int k = 0; k < 8; ++k) // Unroll 8x
1591  sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
1592  weight += 8.0f;
1593  j+=8;
1594  }
1595  while (j < i) {
1596  sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
1597  weight += 1.0f;
1598  ++j;
1599  }
1600 
1601  while ((j + 7) < radius + i) {
1602  for (int k = 0; k < 8; ++k) // Unroll 8x
1603  sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
1604  weight -= 8.0f;
1605  j+=8;
1606  }
1607  while (j < radius + i) {
1608  sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
1609  weight -= 1.0f;
1610  ++j;
1611  }
1612 
1613  tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
1614  }
1615  }
1616  )
1617 
1618  STRINGIFY(
1619  __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
1620  const int radius,
1621  const float strength,
1622  const int imageWidth,
1623  const int imageHeight)
1624  {
1625  const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
1626 
1627  int x = get_global_id(0);
1628  int y = get_global_id(1);
1629 
1630  if ((x >= imageWidth) || (y >= imageHeight))
1631  return;
1632 
1633  global float *src = blurImage + x;
1634 
1635  float sum = 0.0f;
1636  float weight = 1.0f;
1637 
1638  int j = y - radius;
1639  while ((j + 7) < y) {
1640  for (int k = 0; k < 8; ++k) // Unroll 8x
1641  sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
1642  weight += 8.0f;
1643  j+=8;
1644  }
1645  while (j < y) {
1646  sum += weight * src[mirrorBottom(j) * imageWidth];
1647  weight += 1.0f;
1648  ++j;
1649  }
1650 
1651  while ((j + 7) < radius + y) {
1652  for (int k = 0; k < 8; ++k) // Unroll 8x
1653  sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
1654  weight -= 8.0f;
1655  j+=8;
1656  }
1657  while (j < radius + y) {
1658  sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
1659  weight -= 1.0f;
1660  ++j;
1661  }
1662 
1663  CLPixelType pixel = srcImage[x + y * imageWidth];
1664  float srcVal = dot(RGB, convert_float4(pixel));
1665  float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
1666  mult = (srcVal + mult) / srcVal;
1667 
1668  pixel.x = ClampToQuantum(pixel.x * mult);
1669  pixel.y = ClampToQuantum(pixel.y * mult);
1670  pixel.z = ClampToQuantum(pixel.z * mult);
1671 
1672  dstImage[x + y * imageWidth] = pixel;
1673  }
1674  )
1675 
1676 /*
1677 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1678 % %
1679 % %
1680 % %
1681 % M o d u l a t e %
1682 % %
1683 % %
1684 % %
1685 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1686 */
1687 
1688  STRINGIFY(
1689 
1690  static inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
1691  float *hue, float *saturation, float *lightness)
1692  {
1693  float
1694  c,
1695  tmax,
1696  tmin;
1697 
1698  /*
1699  Convert RGB to HSL colorspace.
1700  */
1701  tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
1702  tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
1703 
1704  c=tmax-tmin;
1705 
1706  *lightness=(tmax+tmin)/2.0;
1707  if (c <= 0.0)
1708  {
1709  *hue=0.0;
1710  *saturation=0.0;
1711  return;
1712  }
1713 
1714  if (tmax == (QuantumScale*red))
1715  {
1716  *hue=(QuantumScale*green-QuantumScale*blue)/c;
1717  if ((QuantumScale*green) < (QuantumScale*blue))
1718  *hue+=6.0;
1719  }
1720  else
1721  if (tmax == (QuantumScale*green))
1722  *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
1723  else
1724  *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
1725 
1726  *hue*=60.0/360.0;
1727  if (*lightness <= 0.5)
1728  *saturation=c/(2.0*(*lightness));
1729  else
1730  *saturation=c/(2.0-2.0*(*lightness));
1731  }
1732 
1733  static inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
1734  CLQuantum *red,CLQuantum *green,CLQuantum *blue)
1735  {
1736  float
1737  b,
1738  c,
1739  g,
1740  h,
1741  tmin,
1742  r,
1743  x;
1744 
1745  /*
1746  Convert HSL to RGB colorspace.
1747  */
1748  h=hue*360.0;
1749  if (lightness <= 0.5)
1750  c=2.0*lightness*saturation;
1751  else
1752  c=(2.0-2.0*lightness)*saturation;
1753  tmin=lightness-0.5*c;
1754  h-=360.0*floor(h/360.0);
1755  h/=60.0;
1756  x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
1757  switch ((int) floor(h) % 6)
1758  {
1759  case 0:
1760  default:
1761  {
1762  r=tmin+c;
1763  g=tmin+x;
1764  b=tmin;
1765  break;
1766  }
1767  case 1:
1768  {
1769  r=tmin+x;
1770  g=tmin+c;
1771  b=tmin;
1772  break;
1773  }
1774  case 2:
1775  {
1776  r=tmin;
1777  g=tmin+c;
1778  b=tmin+x;
1779  break;
1780  }
1781  case 3:
1782  {
1783  r=tmin;
1784  g=tmin+x;
1785  b=tmin+c;
1786  break;
1787  }
1788  case 4:
1789  {
1790  r=tmin+x;
1791  g=tmin;
1792  b=tmin+c;
1793  break;
1794  }
1795  case 5:
1796  {
1797  r=tmin+c;
1798  g=tmin;
1799  b=tmin+x;
1800  break;
1801  }
1802  }
1803  *red=ClampToQuantum(QuantumRange*r);
1804  *green=ClampToQuantum(QuantumRange*g);
1805  *blue=ClampToQuantum(QuantumRange*b);
1806  }
1807 
1808  static inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness,
1809  CLQuantum *red,CLQuantum *green,CLQuantum *blue)
1810  {
1811  float
1812  hue,
1813  lightness,
1814  saturation;
1815 
1816  /*
1817  Increase or decrease color lightness, saturation, or hue.
1818  */
1819  ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
1820  hue+=0.5*(0.01*percent_hue-1.0);
1821  while (hue < 0.0)
1822  hue+=1.0;
1823  while (hue >= 1.0)
1824  hue-=1.0;
1825  saturation*=0.01*percent_saturation;
1826  lightness*=0.01*percent_lightness;
1827  ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
1828  }
1829 
1830  __kernel void Modulate(__global CLPixelType *im,
1831  const float percent_brightness,
1832  const float percent_hue,
1833  const float percent_saturation,
1834  const int colorspace)
1835  {
1836 
1837  const int x = get_global_id(0);
1838  const int y = get_global_id(1);
1839  const int columns = get_global_size(0);
1840  const int c = x + y * columns;
1841 
1842  CLPixelType pixel = im[c];
1843 
1844  CLQuantum
1845  blue,
1846  green,
1847  red;
1848 
1849  red=getRed(pixel);
1850  green=getGreen(pixel);
1851  blue=getBlue(pixel);
1852 
1853  switch (colorspace)
1854  {
1855  case HSLColorspace:
1856  default:
1857  {
1858  ModulateHSL(percent_hue, percent_saturation, percent_brightness,
1859  &red, &green, &blue);
1860  }
1861 
1862  }
1863 
1864  CLPixelType filteredPixel;
1865 
1866  setRed(&filteredPixel, red);
1867  setGreen(&filteredPixel, green);
1868  setBlue(&filteredPixel, blue);
1869  filteredPixel.w = pixel.w;
1870 
1871  im[c] = filteredPixel;
1872  }
1873  )
1874 
1875 /*
1876 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1877 % %
1878 % %
1879 % %
1880 % M o t i o n B l u r %
1881 % %
1882 % %
1883 % %
1884 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1885 */
1886 
1887  STRINGIFY(
1888  __kernel
1889  void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
1890  const unsigned int imageWidth, const unsigned int imageHeight,
1891  const __global float *filter, const unsigned int width, const __global int2* offset,
1892  const float4 bias,
1893  const ChannelType channel, const unsigned int matte) {
1894 
1895  int2 currentPixel;
1896  currentPixel.x = get_global_id(0);
1897  currentPixel.y = get_global_id(1);
1898 
1899  if (currentPixel.x >= imageWidth
1900  || currentPixel.y >= imageHeight)
1901  return;
1902 
1903  float4 pixel;
1904  pixel.x = (float)bias.x;
1905  pixel.y = (float)bias.y;
1906  pixel.z = (float)bias.z;
1907  pixel.w = (float)bias.w;
1908 
1909  if (((channel & AlphaChannel) == 0) || (matte == 0)) {
1910 
1911  for (int i = 0; i < width; i++) {
1912  // only support EdgeVirtualPixelMethod through ClampToCanvas
1913  // TODO: implement other virtual pixel method
1914  int2 samplePixel = currentPixel + offset[i];
1915  samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
1916  samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
1917  CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
1918 
1919  pixel.x += (filter[i] * (float)samplePixelValue.x);
1920  pixel.y += (filter[i] * (float)samplePixelValue.y);
1921  pixel.z += (filter[i] * (float)samplePixelValue.z);
1922  pixel.w += (filter[i] * (float)samplePixelValue.w);
1923  }
1924 
1925  CLPixelType outputPixel;
1926  outputPixel.x = ClampToQuantum(pixel.x);
1927  outputPixel.y = ClampToQuantum(pixel.y);
1928  outputPixel.z = ClampToQuantum(pixel.z);
1929  outputPixel.w = ClampToQuantum(pixel.w);
1930  output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
1931  }
1932  else {
1933 
1934  float gamma = 0.0f;
1935  for (int i = 0; i < width; i++) {
1936  // only support EdgeVirtualPixelMethod through ClampToCanvas
1937  // TODO: implement other virtual pixel method
1938  int2 samplePixel = currentPixel + offset[i];
1939  samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
1940  samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
1941 
1942  CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
1943 
1944  float alpha = QuantumScale*samplePixelValue.w;
1945  float k = filter[i];
1946  pixel.x = pixel.x + k * alpha * samplePixelValue.x;
1947  pixel.y = pixel.y + k * alpha * samplePixelValue.y;
1948  pixel.z = pixel.z + k * alpha * samplePixelValue.z;
1949 
1950  pixel.w += k * alpha * samplePixelValue.w;
1951 
1952  gamma+=k*alpha;
1953  }
1954  gamma = PerceptibleReciprocal(gamma);
1955  pixel.xyz = gamma*pixel.xyz;
1956 
1957  CLPixelType outputPixel;
1958  outputPixel.x = ClampToQuantum(pixel.x);
1959  outputPixel.y = ClampToQuantum(pixel.y);
1960  outputPixel.z = ClampToQuantum(pixel.z);
1961  outputPixel.w = ClampToQuantum(pixel.w);
1962  output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
1963  }
1964  }
1965  )
1966 
1967 /*
1968 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1969 % %
1970 % %
1971 % %
1972 % R e s i z e %
1973 % %
1974 % %
1975 % %
1976 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1977 */
1978 
1979  STRINGIFY(
1980  // Based on Box from resize.c
1981  float BoxResizeFilter(const float x)
1982  {
1983  return 1.0f;
1984  }
1985  )
1986 
1987  STRINGIFY(
1988  // Based on CubicBC from resize.c
1989  float CubicBC(const float x,const __global float* resizeFilterCoefficients)
1990  {
1991  /*
1992  Cubic Filters using B,C determined values:
1993  Mitchell-Netravali B = 1/3 C = 1/3 "Balanced" cubic spline filter
1994  Catmull-Rom B = 0 C = 1/2 Interpolatory and exact on linears
1995  Spline B = 1 C = 0 B-Spline Gaussian approximation
1996  Hermite B = 0 C = 0 B-Spline interpolator
1997 
1998  See paper by Mitchell and Netravali, Reconstruction Filters in Computer
1999  Graphics Computer Graphics, Volume 22, Number 4, August 1988
2000  http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
2001  Mitchell.pdf.
2002 
2003  Coefficients are determined from B,C values:
2004  P0 = ( 6 - 2*B )/6 = coeff[0]
2005  P1 = 0
2006  P2 = (-18 +12*B + 6*C )/6 = coeff[1]
2007  P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
2008  Q0 = ( 8*B +24*C )/6 = coeff[3]
2009  Q1 = ( -12*B -48*C )/6 = coeff[4]
2010  Q2 = ( 6*B +30*C )/6 = coeff[5]
2011  Q3 = ( - 1*B - 6*C )/6 = coeff[6]
2012 
2013  which are used to define the filter:
2014 
2015  P0 + P1*x + P2*x^2 + P3*x^3 0 <= x < 1
2016  Q0 + Q1*x + Q2*x^2 + Q3*x^3 1 <= x < 2
2017 
2018  which ensures function is continuous in value and derivative (slope).
2019  */
2020  if (x < 1.0)
2021  return(resizeFilterCoefficients[0]+x*(x*
2022  (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
2023  if (x < 2.0)
2024  return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
2025  (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
2026  return(0.0);
2027  }
2028  )
2029 
2030  STRINGIFY(
2031  float Sinc(const float x)
2032  {
2033  if (x != 0.0f)
2034  {
2035  const float alpha=(float) (MagickPI*x);
2036  return sinpi(x)/alpha;
2037  }
2038  return(1.0f);
2039  }
2040  )
2041 
2042  STRINGIFY(
2043  float Triangle(const float x)
2044  {
2045  /*
2046  1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
2047  a Bartlett 2D Cone filter. Also used as a Bartlett Windowing function
2048  for Sinc().
2049  */
2050  return ((x<1.0f)?(1.0f-x):0.0f);
2051  }
2052  )
2053 
2054 
2055  STRINGIFY(
2056  float Hann(const float x)
2057  {
2058  /*
2059  Cosine window function:
2060  0.5+0.5*cos(pi*x).
2061  */
2062  const float cosine=cos((MagickPI*x));
2063  return(0.5f+0.5f*cosine);
2064  }
2065  )
2066 
2067  STRINGIFY(
2068  float Hamming(const float x)
2069  {
2070  /*
2071  Offset cosine window function:
2072  .54 + .46 cos(pi x).
2073  */
2074  const float cosine=cos((MagickPI*x));
2075  return(0.54f+0.46f*cosine);
2076  }
2077  )
2078 
2079  STRINGIFY(
2080  float Blackman(const float x)
2081  {
2082  /*
2083  Blackman: 2nd order cosine windowing function:
2084  0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
2085 
2086  Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
2087  five flops.
2088  */
2089  const float cosine=cos((MagickPI*x));
2090  return(0.34f+cosine*(0.5f+cosine*0.16f));
2091  }
2092  )
2093 
2094  STRINGIFY(
2095  static inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
2096  {
2097  switch (filterType)
2098  {
2099  /* Call Sinc even for SincFast to get better precision on GPU
2100  and to avoid thread divergence. Sinc is pretty fast on GPU anyway...*/
2101  case SincWeightingFunction:
2102  case SincFastWeightingFunction:
2103  return Sinc(x);
2104  case CubicBCWeightingFunction:
2105  return CubicBC(x,filterCoefficients);
2106  case BoxWeightingFunction:
2107  return BoxResizeFilter(x);
2108  case TriangleWeightingFunction:
2109  return Triangle(x);
2110  case HannWeightingFunction:
2111  return Hann(x);
2112  case HammingWeightingFunction:
2113  return Hamming(x);
2114  case BlackmanWeightingFunction:
2115  return Blackman(x);
2116 
2117  default:
2118  return 0.0f;
2119  }
2120  }
2121  )
2122 
2123 
2124  STRINGIFY(
2125  static inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
2126  , const ResizeWeightingFunctionType resizeWindowType
2127  , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
2128  {
2129  float scale;
2130  float xBlur = fabs(x/resizeFilterBlur);
2131  if (resizeWindowSupport < MagickEpsilon
2132  || resizeWindowType == BoxWeightingFunction)
2133  {
2134  scale = 1.0f;
2135  }
2136  else
2137  {
2138  scale = resizeFilterScale;
2139  scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
2140  }
2141  float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
2142  return weight;
2143  }
2144 
2145  )
2146 
2147  ;
2148  const char *accelerateKernels2 =
2149 
2150  STRINGIFY(
2151 
2152  static inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2153  return (numWorkItems/pixelPerWorkgroup);
2154  }
2155 
2156  // returns the index of the pixel for the current workitem to compute.
2157  // returns -1 if this workitem doesn't need to participate in any computation
2158  static inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2159  const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
2160  int pixelIndex = itemID/numWorkItemsPerPixel;
2161  pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
2162  return pixelIndex;
2163  }
2164 
2165  )
2166 
2167  STRINGIFY(
2168  __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
2169  void ResizeHorizontalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
2170  const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
2171  const unsigned int filteredColumns, const unsigned int filteredRows, const float xFactor,
2172  const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
2173  const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
2174  const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
2175  const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
2176  __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
2177  {
2178  // calculate the range of resized image pixels computed by this workgroup
2179  const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
2180  const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
2181  const unsigned int actualNumPixelToCompute = stopX - startX;
2182 
2183  // calculate the range of input image pixels to cache
2184  float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
2185  const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2186  scale = PerceptibleReciprocal(scale);
2187 
2188  const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
2189  const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
2190 
2191  // cache the input pixels into local memory
2192  const unsigned int y = get_global_id(1);
2193  const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y);
2194  const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels;
2195  event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0);
2196  wait_group_events(1, &e);
2197 
2198  unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2199  unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2200  for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2201  {
2202  const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
2203  const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
2204  const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
2205 
2206  // determine which resized pixel computed by this workitem
2207  const unsigned int itemID = get_local_id(0);
2208  const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
2209 
2210  const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
2211 
2212  float4 filteredPixel = (float4)0.0f;
2213  float density = 0.0f;
2214  float gamma = 0.0f;
2215  // -1 means this workitem doesn't participate in the computation
2216  if (pixelIndex != -1)
2217  {
2218  // x coordinated of the resized pixel computed by this workitem
2219  const int x = chunkStartX + pixelIndex;
2220 
2221  // calculate how many steps required for this pixel
2222  const float bisect = (x+0.5)/xFactor+MagickEpsilon;
2223  const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2224  const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
2225  const unsigned int n = stop - start;
2226 
2227  // calculate how many steps this workitem will contribute
2228  unsigned int numStepsPerWorkItem = n / numItems;
2229  numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2230 
2231  const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2232  if (startStep < n)
2233  {
2234  const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2235 
2236  unsigned int cacheIndex = start+startStep-cacheRangeStartX;
2237  for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2238  {
2239  float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2240  (ResizeWeightingFunctionType) resizeFilterType,
2241  (ResizeWeightingFunctionType) resizeWindowType,
2242  resizeFilterScale, resizeFilterWindowSupport,
2243  resizeFilterBlur, scale*(start + i - bisect + 0.5));
2244 
2245  float4 cp = (float4)0.0f;
2246 
2247  __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels);
2248  cp.x = (float) *(p);
2249  if (number_channels > 2)
2250  {
2251  cp.y = (float) *(p + 1);
2252  cp.z = (float) *(p + 2);
2253  }
2254 
2255  if (alpha_index != 0)
2256  {
2257  cp.w = (float) *(p + alpha_index);
2258 
2259  float alpha = weight * QuantumScale * cp.w;
2260 
2261  filteredPixel.x += alpha * cp.x;
2262  filteredPixel.y += alpha * cp.y;
2263  filteredPixel.z += alpha * cp.z;
2264  filteredPixel.w += weight * cp.w;
2265  gamma += alpha;
2266  }
2267  else
2268  filteredPixel += ((float4) weight)*cp;
2269 
2270  density += weight;
2271  }
2272  }
2273  }
2274 
2275  // initialize the accumulators to zero
2276  if (itemID < actualNumPixelInThisChunk) {
2277  outputPixelCache[itemID] = (float4)0.0f;
2278  densityCache[itemID] = 0.0f;
2279  if (alpha_index != 0)
2280  gammaCache[itemID] = 0.0f;
2281  }
2282  barrier(CLK_LOCAL_MEM_FENCE);
2283 
2284  // accumulate the filtered pixel value and the density
2285  for (unsigned int i = 0; i < numItems; i++) {
2286  if (pixelIndex != -1) {
2287  if (itemID%numItems == i) {
2288  outputPixelCache[pixelIndex]+=filteredPixel;
2289  densityCache[pixelIndex]+=density;
2290  if (alpha_index != 0)
2291  gammaCache[pixelIndex]+=gamma;
2292  }
2293  }
2294  barrier(CLK_LOCAL_MEM_FENCE);
2295  }
2296 
2297  if (itemID < actualNumPixelInThisChunk)
2298  {
2299  float4 filteredPixel = outputPixelCache[itemID];
2300 
2301  float gamma = 0.0f;
2302  if (alpha_index != 0)
2303  gamma = gammaCache[itemID];
2304 
2305  float density = densityCache[itemID];
2306  if ((density != 0.0f) && (density != 1.0f))
2307  {
2308  density = PerceptibleReciprocal(density);
2309  filteredPixel *= (float4) density;
2310  if (alpha_index != 0)
2311  gamma *= density;
2312  }
2313 
2314  if (alpha_index != 0)
2315  {
2316  gamma = PerceptibleReciprocal(gamma);
2317  filteredPixel.x *= gamma;
2318  filteredPixel.y *= gamma;
2319  filteredPixel.z *= gamma;
2320  }
2321 
2322  WriteAllChannels(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, filteredPixel);
2323  }
2324  }
2325  }
2326  )
2327 
2328 
2329  STRINGIFY(
2330  __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
2331  void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
2332  const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
2333  const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor,
2334  const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
2335  const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
2336  const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
2337  const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
2338  __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
2339  {
2340  // calculate the range of resized image pixels computed by this workgroup
2341  const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
2342  const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
2343  const unsigned int actualNumPixelToCompute = stopY - startY;
2344 
2345  // calculate the range of input image pixels to cache
2346  float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
2347  const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2348  scale = PerceptibleReciprocal(scale);
2349 
2350  const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
2351  const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
2352 
2353  // cache the input pixels into local memory
2354  const unsigned int x = get_global_id(0);
2355  unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY);
2356  unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY;
2357  unsigned int stride = inputColumns * number_channels;
2358  for (unsigned int i = 0; i < number_channels; i++)
2359  {
2360  event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0);
2361  wait_group_events(1,&e);
2362  }
2363 
2364  unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2365  unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2366  for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2367  {
2368  const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
2369  const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
2370  const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
2371 
2372  // determine which resized pixel computed by this workitem
2373  const unsigned int itemID = get_local_id(1);
2374  const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
2375 
2376  const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
2377 
2378  float4 filteredPixel = (float4)0.0f;
2379  float density = 0.0f;
2380  float gamma = 0.0f;
2381  // -1 means this workitem doesn't participate in the computation
2382  if (pixelIndex != -1)
2383  {
2384  // x coordinated of the resized pixel computed by this workitem
2385  const int y = chunkStartY + pixelIndex;
2386 
2387  // calculate how many steps required for this pixel
2388  const float bisect = (y+0.5)/yFactor+MagickEpsilon;
2389  const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2390  const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows);
2391  const unsigned int n = stop - start;
2392 
2393  // calculate how many steps this workitem will contribute
2394  unsigned int numStepsPerWorkItem = n / numItems;
2395  numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2396 
2397  const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2398  if (startStep < n)
2399  {
2400  const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2401 
2402  unsigned int cacheIndex = start+startStep-cacheRangeStartY;
2403  for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2404  {
2405  float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2406  (ResizeWeightingFunctionType) resizeFilterType,
2407  (ResizeWeightingFunctionType) resizeWindowType,
2408  resizeFilterScale, resizeFilterWindowSupport,
2409  resizeFilterBlur, scale*(start + i - bisect + 0.5));
2410 
2411  float4 cp = (float4)0.0f;
2412 
2413  __local CLQuantum *p = inputImageCache + cacheIndex;
2414  cp.x = (float) *(p);
2415  if (number_channels > 2)
2416  {
2417  cp.y = (float) *(p + rangeLength);
2418  cp.z = (float) *(p + (rangeLength * 2));
2419  }
2420 
2421  if (alpha_index != 0)
2422  {
2423  cp.w = (float) *(p + (rangeLength * alpha_index));
2424 
2425  float alpha = weight * QuantumScale * cp.w;
2426 
2427  filteredPixel.x += alpha * cp.x;
2428  filteredPixel.y += alpha * cp.y;
2429  filteredPixel.z += alpha * cp.z;
2430  filteredPixel.w += weight * cp.w;
2431  gamma += alpha;
2432  }
2433  else
2434  filteredPixel += ((float4) weight)*cp;
2435 
2436  density += weight;
2437  }
2438  }
2439  }
2440 
2441  // initialize the accumulators to zero
2442  if (itemID < actualNumPixelInThisChunk) {
2443  outputPixelCache[itemID] = (float4)0.0f;
2444  densityCache[itemID] = 0.0f;
2445  if (alpha_index != 0)
2446  gammaCache[itemID] = 0.0f;
2447  }
2448  barrier(CLK_LOCAL_MEM_FENCE);
2449 
2450  // accumulate the filtered pixel value and the density
2451  for (unsigned int i = 0; i < numItems; i++) {
2452  if (pixelIndex != -1) {
2453  if (itemID%numItems == i) {
2454  outputPixelCache[pixelIndex]+=filteredPixel;
2455  densityCache[pixelIndex]+=density;
2456  if (alpha_index != 0)
2457  gammaCache[pixelIndex]+=gamma;
2458  }
2459  }
2460  barrier(CLK_LOCAL_MEM_FENCE);
2461  }
2462 
2463  if (itemID < actualNumPixelInThisChunk)
2464  {
2465  float4 filteredPixel = outputPixelCache[itemID];
2466 
2467  float gamma = 0.0f;
2468  if (alpha_index != 0)
2469  gamma = gammaCache[itemID];
2470 
2471  float density = densityCache[itemID];
2472  if ((density != 0.0f) && (density != 1.0f))
2473  {
2474  density = PerceptibleReciprocal(density);
2475  filteredPixel *= (float4) density;
2476  if (alpha_index != 0)
2477  gamma *= density;
2478  }
2479 
2480  if (alpha_index != 0)
2481  {
2482  gamma = PerceptibleReciprocal(gamma);
2483  filteredPixel.x *= gamma;
2484  filteredPixel.y *= gamma;
2485  filteredPixel.z *= gamma;
2486  }
2487 
2488  WriteAllChannels(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, filteredPixel);
2489  }
2490  }
2491  }
2492  )
2493 
2494 /*
2495 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2496 % %
2497 % %
2498 % %
2499 % R o t a t i o n a l B l u r %
2500 % %
2501 % %
2502 % %
2503 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2504 */
2505 
2506  STRINGIFY(
2507  __kernel void RotationalBlur(const __global CLQuantum *image,
2508  const unsigned int number_channels,const unsigned int channel,
2509  const float2 blurCenter,__constant float *cos_theta,
2510  __constant float *sin_theta,const unsigned int cossin_theta_size,
2511  __global CLQuantum *filteredImage)
2512  {
2513  const int x = get_global_id(0);
2514  const int y = get_global_id(1);
2515  const int columns = get_global_size(0);
2516  const int rows = get_global_size(1);
2517  unsigned int step = 1;
2518  float center_x = (float) x - blurCenter.x;
2519  float center_y = (float) y - blurCenter.y;
2520  float radius = hypot(center_x, center_y);
2521 
2522  float blur_radius = hypot(blurCenter.x, blurCenter.y);
2523 
2524  if (radius > MagickEpsilon)
2525  {
2526  step = (unsigned int) (blur_radius / radius);
2527  if (step == 0)
2528  step = 1;
2529  if (step >= cossin_theta_size)
2530  step = cossin_theta_size-1;
2531  }
2532 
2533  float4 result = 0.0f;
2534  float normalize = 0.0f;
2535  float gamma = 0.0f;
2536 
2537  for (unsigned int i=0; i<cossin_theta_size; i+=step)
2538  {
2539  int cx = ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns);
2540  int cy = ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f,rows);
2541 
2542  float4 pixel = ReadAllChannels(image, number_channels, columns, cx, cy);
2543 
2544  if ((number_channels == 4) || (number_channels == 2))
2545  {
2546  float alpha = (float)(QuantumScale*pixel.w);
2547 
2548  gamma += alpha;
2549 
2550  result.x += alpha * pixel.x;
2551  result.y += alpha * pixel.y;
2552  result.z += alpha * pixel.z;
2553  result.w += pixel.w;
2554  }
2555  else
2556  result += pixel;
2557 
2558  normalize += 1.0f;
2559  }
2560 
2561  normalize = PerceptibleReciprocal(normalize);
2562 
2563  if ((number_channels == 4) || (number_channels == 2))
2564  {
2565  gamma = PerceptibleReciprocal(gamma);
2566  result.x *= gamma;
2567  result.y *= gamma;
2568  result.z *= gamma;
2569  result.w *= normalize;
2570  }
2571  else
2572  result *= normalize;
2573 
2574  WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
2575  }
2576  )
2577 
2578 /*
2579 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2580 % %
2581 % %
2582 % %
2583 % U n s h a r p M a s k %
2584 % %
2585 % %
2586 % %
2587 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2588 */
2589 
2590  STRINGIFY(
2591  __kernel void UnsharpMaskBlurColumn(const __global CLQuantum* image,
2592  const __global float4 *blurRowData,const unsigned int number_channels,
2593  const ChannelType channel,const unsigned int columns,
2594  const unsigned int rows,__local float4* cachedData,
2595  __local float* cachedFilter,const __global float *filter,
2596  const unsigned int width,const float gain, const float threshold,
2597  __global CLQuantum *filteredImage)
2598  {
2599  const unsigned int radius = (width-1)/2;
2600 
2601  // cache the pixel shared by the workgroup
2602  const int groupX = get_group_id(0);
2603  const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
2604  const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
2605 
2606  if ((groupStartY >= 0) && (groupStopY < rows))
2607  {
2608  event_t e = async_work_group_strided_copy(cachedData,
2609  blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0);
2610  wait_group_events(1,&e);
2611  }
2612  else
2613  {
2614  for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1))
2615  cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX];
2616 
2617  barrier(CLK_LOCAL_MEM_FENCE);
2618  }
2619  // cache the filter as well
2620  event_t e = async_work_group_copy(cachedFilter,filter,width,0);
2621  wait_group_events(1,&e);
2622 
2623  // only do the work if this is not a patched item
2624  const int cy = get_global_id(1);
2625 
2626  if (cy < rows)
2627  {
2628  float4 blurredPixel = (float4) 0.0f;
2629 
2630  int i = 0;
2631 
2632  for ( ; i+7 < width; )
2633  {
2634  for (int j=0; j < 8; j++, i++)
2635  blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)];
2636  }
2637 
2638  for ( ; i < width; i++)
2639  blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
2640 
2641  float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel);
2642  float4 outputPixel = inputImagePixel - blurredPixel;
2643 
2644  float quantumThreshold = QuantumRange*threshold;
2645 
2646  int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
2647  outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
2648 
2649  //write back
2650  WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel);
2651  }
2652  }
2653  )
2654 
2655  STRINGIFY(
2656  __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels,
2657  const ChannelType channel,__constant float *filter,const unsigned int width,
2658  const unsigned int columns,const unsigned int rows,__local float4 *pixels,
2659  const float gain,const float threshold,__global CLQuantum *filteredImage)
2660  {
2661  const unsigned int x = get_global_id(0);
2662  const unsigned int y = get_global_id(1);
2663 
2664  const unsigned int radius = (width - 1) / 2;
2665 
2666  int row = y - radius;
2667  int baseRow = get_group_id(1) * get_local_size(1) - radius;
2668  int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
2669 
2670  while (row < endRow) {
2671  int srcy = (row < 0) ? -row : row; // mirror pad
2672  srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy;
2673 
2674  float4 value = 0.0f;
2675 
2676  int ix = x - radius;
2677  int i = 0;
2678 
2679  while (i + 7 < width) {
2680  for (int j = 0; j < 8; ++j) { // unrolled
2681  int srcx = ix + j;
2682  srcx = (srcx < 0) ? -srcx : srcx;
2683  srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
2684  value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
2685  }
2686  ix += 8;
2687  i += 8;
2688  }
2689 
2690  while (i < width) {
2691  int srcx = (ix < 0) ? -ix : ix; // mirror pad
2692  srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
2693  value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
2694  ++i;
2695  ++ix;
2696  }
2697  pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
2698  row += get_local_size(1);
2699  }
2700 
2701  barrier(CLK_LOCAL_MEM_FENCE);
2702 
2703  const int px = get_local_id(0);
2704  const int py = get_local_id(1);
2705  const int prp = get_local_size(0);
2706  float4 value = (float4)(0.0f);
2707 
2708  int i = 0;
2709  while (i + 7 < width) {
2710  for (int j = 0; j < 8; ++j) // unrolled
2711  value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp];
2712  i += 8;
2713  }
2714  while (i < width) {
2715  value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
2716  ++i;
2717  }
2718 
2719  if ((x < columns) && (y < rows)) {
2720  float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel);
2721  float4 diff = srcPixel - value;
2722 
2723  float quantumThreshold = QuantumRange*threshold;
2724 
2725  int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
2726  value = select(srcPixel + diff * gain, srcPixel, mask);
2727 
2728  WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value);
2729  }
2730  }
2731  )
2732 
2733 /*
2734 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2735 % %
2736 % %
2737 % %
2738 % W a v e l e t D e n o i s e %
2739 % %
2740 % %
2741 % %
2742 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2743 */
2744 
2745  STRINGIFY(
2746  __kernel __attribute__((reqd_work_group_size(64, 4, 1)))
2747  void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage,
2748  const unsigned int number_channels,const unsigned int max_channels,
2749  const float threshold,const int passes,const unsigned int imageWidth,
2750  const unsigned int imageHeight)
2751  {
2752  const int pad = (1 << (passes - 1));
2753  const int tileSize = 64;
2754  const int tileRowPixels = 64;
2755  const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
2756 
2757  CLQuantum stage[48]; // 16 * 3 (we only need 3 channels)
2758 
2759  local float buffer[64 * 64];
2760 
2761  int srcx = (get_group_id(0) + get_global_offset(0) / tileSize) * (tileSize - 2 * pad) - pad + get_local_id(0);
2762  int srcy = (get_group_id(1) + get_global_offset(1) / 4) * (tileSize - 2 * pad) - pad;
2763 
2764  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
2765  int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) +
2766  (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels;
2767 
2768  for (int channel = 0; channel < max_channels; ++channel)
2769  stage[(i / 4) + (16 * channel)] = srcImage[pos + channel];
2770  }
2771 
2772  for (int channel = 0; channel < max_channels; ++channel) {
2773  // Load LDS
2774  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
2775  buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]);
2776 
2777  // Process
2778 
2779  float tmp[16];
2780  float accum[16];
2781  float pixel;
2782 
2783  for (int i = 0; i < 16; i++)
2784  accum[i]=0.0f;
2785 
2786  for (int pass = 0; pass < passes; ++pass) {
2787  const int radius = 1 << pass;
2788  const int x = get_local_id(0);
2789  const float thresh = threshold * noise[pass];
2790 
2791  // Apply horizontal hat
2792  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
2793  const int offset = i * tileRowPixels;
2794  if (pass == 0)
2795  tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass
2796  pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
2797  barrier(CLK_LOCAL_MEM_FENCE);
2798  buffer[x + offset] = pixel;
2799  }
2800  barrier(CLK_LOCAL_MEM_FENCE);
2801 
2802  // Apply vertical hat
2803  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
2804  pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
2805  float delta = tmp[i / 4] - pixel;
2806  tmp[i / 4] = pixel; // hold output in tmp until all workitems are done
2807  if (delta < -thresh)
2808  delta += thresh;
2809  else if (delta > thresh)
2810  delta -= thresh;
2811  else
2812  delta = 0;
2813  accum[i / 4] += delta;
2814  }
2815  barrier(CLK_LOCAL_MEM_FENCE);
2816 
2817  if (pass < passes - 1)
2818  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
2819  buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
2820  else // last pass
2821  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
2822  accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
2823  barrier(CLK_LOCAL_MEM_FENCE);
2824  }
2825 
2826  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
2827  stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]);
2828 
2829  barrier(CLK_LOCAL_MEM_FENCE);
2830  }
2831 
2832  // Write from stage to output
2833 
2834  if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
2835  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
2836  if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
2837  int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels));
2838  for (int channel = 0; channel < max_channels; ++channel) {
2839  dstImage[pos + channel] = stage[(i / 4) + (16 * channel)];
2840  }
2841  }
2842  }
2843  }
2844  }
2845  )
2846 
2847  ;
2848 
2849 #endif // MAGICKCORE_OPENCL_SUPPORT
2850 
2851 #if defined(__cplusplus) || defined(c_plusplus)
2852 }
2853 #endif
2854 
2855 #endif // MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H