#include "../SDL_internal.h"

    const unsigned A = info->a;

        if ( palmap == NULL ) {
            *dst = ((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
        } else {
            *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
        }
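        /* The 3-3-2 packing above keeps the top 3 bits of R (bits 7-5),
           the top 3 bits of G (bits 4-2) and the top 2 bits of B (bits 1-0);
           when the destination palette is not an exact 3-3-2 ramp, palmap
           remaps that value through the precomputed palette map. */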

    unsigned sR, sG, sB, sA;

        if ( palmap == NULL ) {
            *dst = ((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
        } else {
            *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
        }

    const unsigned A = info->a;

        if ( Pixel != ckey ) {

            if ( palmap == NULL ) {
                *dst = ((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
            } else {
                *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
            }

    __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;

    hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);
    lmask = _mm_set_pi32(0x00010101, 0x00010101);
    dsta = _mm_set_pi32(dalpha, dalpha);

            *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
                       + (s & d & 0x00010101)) | dalpha;
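            /* The store above is an exact 50% blend of two ARGB8888 pixels
               done without unpacking: clearing the low bit of every byte
               (0x00fefefe) leaves room for the carries, so one 32-bit add
               and shift averages the three colour channels at once, and
               (s & d & 0x00010101) adds back the rounding bit lost when
               both low bits were set.  dalpha forces the destination
               alpha. */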

            for (n >>= 1; n > 0; --n) {
                dst1 = *(__m64 *) dstp;
                dst2 = dst1;

                src1 = *(__m64 *) srcp;
                src2 = src1;

                dst2 = _mm_and_si64(dst2, hmask);
                src2 = _mm_and_si64(src2, hmask);
                src2 = _mm_add_pi32(src2, dst2);
                src2 = _mm_srli_pi32(src2, 1);

                dst1 = _mm_and_si64(dst1, src1);
                dst1 = _mm_and_si64(dst1, lmask);
                dst1 = _mm_add_pi32(dst1, src2);
                dst1 = _mm_or_si64(dst1, dsta);

                *(__m64 *) dstp = dst1;
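                /* Same mask/add/shift averaging as the scalar store above,
                   applied to two packed pixels per iteration: hmask keeps
                   the high seven bits of every colour byte, lmask restores
                   the rounding bit, and dsta ORs the destination alpha
                   back in. */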

        BlitRGBtoRGBSurfaceAlpha128MMX(info);

        __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;

        mm_zero = _mm_setzero_si64();
        amult = amult | (amult << 16);
        chanmask =
            (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
        mm_alpha = _mm_set_pi32(0, amult & chanmask);
        mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero);
        dsta = _mm_set_pi32(dalpha, dalpha);
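        /* Constant surface alpha: the loops below compute
           d = d + ((s - d) * alpha >> 8) on channels widened to 16-bit
           lanes.  amult replicates alpha into every byte; masking it with
           chanmask zeroes the alpha lane so only R, G and B are scaled,
           and dsta forces the destination alpha afterwards. */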

            src2 = _mm_cvtsi32_si64(*srcp);
            src2 = _mm_unpacklo_pi8(src2, mm_zero);

            dst1 = _mm_cvtsi32_si64(*dstp);
            dst1 = _mm_unpacklo_pi8(dst1, mm_zero);

            src2 = _mm_sub_pi16(src2, dst1);
            src2 = _mm_mullo_pi16(src2, mm_alpha);
            src2 = _mm_srli_pi16(src2, 8);
            dst1 = _mm_add_pi8(src2, dst1);

            dst1 = _mm_packs_pu16(dst1, mm_zero);
            dst1 = _mm_or_si64(dst1, dsta);
            *dstp = _mm_cvtsi64_si32(dst1);

            for (n >>= 1; n > 0; --n) {
                src1 = *(__m64 *) srcp;
                src2 = src1;
                src1 = _mm_unpacklo_pi8(src1, mm_zero);
                src2 = _mm_unpackhi_pi8(src2, mm_zero);

                dst1 = *(__m64 *) dstp;
                dst2 = dst1;
                dst1 = _mm_unpacklo_pi8(dst1, mm_zero);
                dst2 = _mm_unpackhi_pi8(dst2, mm_zero);

                src1 = _mm_sub_pi16(src1, dst1);
                src1 = _mm_mullo_pi16(src1, mm_alpha);
                src1 = _mm_srli_pi16(src1, 8);
                dst1 = _mm_add_pi8(src1, dst1);

                src2 = _mm_sub_pi16(src2, dst2);
                src2 = _mm_mullo_pi16(src2, mm_alpha);
                src2 = _mm_srli_pi16(src2, 8);
                dst2 = _mm_add_pi8(src2, dst2);

                dst1 = _mm_packs_pu16(dst1, dst2);
                dst1 = _mm_or_si64(dst1, dsta);

                *(__m64 *) dstp = dst1;
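                /* Two pixels per iteration: the low and high halves of each
                   __m64 are widened to 16-bit lanes, blended independently
                   with the same (s - d) * alpha >> 8 step, then packed back
                   together with unsigned saturation. */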

    Uint64 multmask, multmask2;

    __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;

    mm_zero = _mm_setzero_si64();
    multmask = 0x00FF;
    multmask <<= (ashift * 2);
    multmask2 = 0x00FF00FF00FF00FFULL;
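    /* multmask places 0x00FF in the 16-bit lane that will hold the alpha
       channel (ashift is doubled because each 8-bit channel becomes a
       16-bit lane after unpacking); OR-ing it into the source multiplier
       keeps the source alpha at full weight.  multmask2 holds 0x00FF in
       every lane, so XOR-ing it with the replicated alpha gives
       255 - alpha for the destination multiplier.  The per-pixel blend
       below is then roughly d = (s * alpha + d * (255 - alpha)) >> 8 per
       channel. */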

            } else if (alpha == amask) {

                src1 = _mm_cvtsi32_si64(*srcp);
                src1 = _mm_unpacklo_pi8(src1, mm_zero);

                dst1 = _mm_cvtsi32_si64(*dstp);
                dst1 = _mm_unpacklo_pi8(dst1, mm_zero);

                mm_alpha = _mm_cvtsi32_si64(alpha);
                mm_alpha = _mm_srli_si64(mm_alpha, ashift);
                mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);
                mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha);
                mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);
                mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);

                src1 = _mm_mullo_pi16(src1, mm_alpha);
                src1 = _mm_srli_pi16(src1, 8);
                dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
                dst1 = _mm_srli_pi16(dst1, 8);
                dst1 = _mm_add_pi16(src1, dst1);
                dst1 = _mm_packs_pu16(dst1, mm_zero);

                *dstp = _mm_cvtsi64_si32(dst1);

                *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
                           + (s & d & 0x00010101)) | 0xff000000;

            d1 = (d1 + ((s1 - d1) * alpha >> 8))
                 & 0xff00ff;
            d = (d + ((s - d) * alpha >> 8)) & 0xff00;
            *dstp = d1 | d | 0xff000000;
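            /* The scalar constant-alpha blend splits each pixel in two:
               s1/d1 carry red and blue (0x00ff00ff) and s/d carry green
               (0x0000ff00), so every multiply by alpha stays inside its
               own 8-bit gaps and the three channels need only two
               multiplies; the halves are then reassembled with an opaque
               alpha. */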

                d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
                d = (d + ((s - d) * alpha >> 8)) & 0xff00;
                dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
                *dstp = d1 | d | (dalpha << 24);
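                /* Per-pixel alpha onto an ARGB destination: the colour
                   channels use the same red/blue + green split, and the
                   destination alpha is composited with the usual "over"
                   approximation srcA + dstA * (255 - srcA) / 255, where
                   alpha ^ 0xFF stands in for 255 - alpha and >> 8 for the
                   division by 255. */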

    Uint64 multmask, multmask2;

    __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;

    mm_zero = _mm_setzero_si64();
    multmask = 0x00FF;
    multmask <<= (ashift * 2);
    multmask2 = 0x00FF00FF00FF00FFULL;

            _m_prefetch(srcp + 16);
            _m_prefetch(dstp + 16);
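            /* The 3DNow! variant performs the same blend as the MMX path
               above; the only difference is that it prefetches the source
               and destination data 64 bytes ahead of the current pixels. */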

            alpha = *srcp & amask;

            } else if (alpha == amask) {

                src1 = _mm_cvtsi32_si64(*srcp);
                src1 = _mm_unpacklo_pi8(src1, mm_zero);

                dst1 = _mm_cvtsi32_si64(*dstp);
                dst1 = _mm_unpacklo_pi8(dst1, mm_zero);

                mm_alpha = _mm_cvtsi32_si64(alpha);
                mm_alpha = _mm_srli_si64(mm_alpha, ashift);
                mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);
                mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha);
                mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);
                mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);

                src1 = _mm_mullo_pi16(src1, mm_alpha);
                src1 = _mm_srli_pi16(src1, 8);
                dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
                dst1 = _mm_srli_pi16(dst1, 8);
                dst1 = _mm_add_pi16(src1, dst1);
                dst1 = _mm_packs_pu16(dst1, mm_zero);

                *dstp = _mm_cvtsi64_si32(dst1);

#define BLEND16_50(d, s, mask) \
    ((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))

#define BLEND2x16_50(d, s, mask) \
    (((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
     + (s & d & (~(mask | mask << 16))))
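/* Both macros are exact 50% blends of 16-bit pixels: the caller passes the
   pixel mask with the low bit of every channel cleared (e.g. 0xf7de for
   RGB565), which leaves room for the carries, and (s & d & ~mask) restores
   the rounding bit for channels whose low bit was set in both pixels.
   BLEND2x16_50 handles two pixels packed into one 32-bit word and shifts
   before adding so no carry can cross the boundary between the two
   pixels. */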

            prev_sw = ((Uint32 *) srcp)[-1];
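            /* Here src and dst are misaligned with respect to each other:
               the destination is still written one aligned 32-bit word
               (two pixels) at a time, so each word is assembled from half
               of the previous source word and half of the current one; the
               byte-order test below picks which halves go where. */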
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
                s = (prev_sw << 16) + (sw >> 16);
#else
                s = (prev_sw >> 16) + (sw << 16);
#endif

#if SDL_BYTEORDER == SDL_BIG_ENDIAN

    __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;

    alpha &= ~(1 + 2 + 4);
    mm_alpha = _mm_set_pi32(0, alpha);

    mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);
    mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);

    mm_alpha = _mm_slli_si64(mm_alpha, 3);

    gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);
    bmask = _mm_set_pi32(0x001F001F, 0x001F001F);
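    /* alpha has its low three bits cleared (so it matches the 5-bit
       precision of the scalar 16-bit paths), is replicated into all four
       16-bit lanes and then shifted left by 3, which lets the per-channel
       blends below mix _mm_mullo_pi16 and _mm_mulhi_pi16 while still
       scaling each (src - dst) difference by alpha / 256. */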

            s = (s | s << 16) & 0x07e0f81f;
            d = (d | d << 16) & 0x07e0f81f;
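            /* (x | x << 16) & 0x07e0f81f spreads one RGB565 pixel across a
               32-bit word: red and blue stay in the low half (0xf81f) and
               green moves to the high half (0x07e0 << 16).  Each field then
               has at least five empty bits above it, so a single
               d += (s - d) * alpha >> 5 blends all three channels with one
               multiply before the word is masked and folded back with
               d | d >> 16. */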

            s = (s | s << 16) & 0x07e0f81f;
            d = (d | d << 16) & 0x07e0f81f;

            s = (s | s << 16) & 0x07e0f81f;
            d = (d | d << 16) & 0x07e0f81f;

                src1 = *(__m64 *) srcp;
                dst1 = *(__m64 *) dstp;

                src2 = _mm_srli_pi16(src2, 11);
                dst2 = _mm_srli_pi16(dst2, 11);

                src2 = _mm_sub_pi16(src2, dst2);
                src2 = _mm_mullo_pi16(src2, mm_alpha);
                src2 = _mm_srli_pi16(src2, 11);
                dst2 = _mm_add_pi16(src2, dst2);
                dst2 = _mm_slli_pi16(dst2, 11);

                src2 = _mm_and_si64(src2, gmask);
                dst2 = _mm_and_si64(dst2, gmask);

                src2 = _mm_sub_pi16(src2, dst2);
                src2 = _mm_mulhi_pi16(src2, mm_alpha);
                src2 = _mm_slli_pi16(src2, 5);
                dst2 = _mm_add_pi16(src2, dst2);

                mm_res = _mm_or_si64(mm_res, dst2);

                src2 = _mm_and_si64(src2, bmask);
                dst2 = _mm_and_si64(dst2, bmask);

                src2 = _mm_sub_pi16(src2, dst2);
                src2 = _mm_mullo_pi16(src2, mm_alpha);
                src2 = _mm_srli_pi16(src2, 11);
                dst2 = _mm_add_pi16(src2, dst2);
                dst2 = _mm_and_si64(dst2, bmask);

                mm_res = _mm_or_si64(mm_res, dst2);

                *(__m64 *) dstp = mm_res;
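                /* Four 565 pixels are blended per iteration.  Each channel
                   is isolated into the 16-bit lanes (red needs no mask,
                   since the shift right by 11 already clears the other
                   bits), blended as dst += (src - dst) * alpha with the
                   pre-positioned mm_alpha, shifted and masked back into
                   place, and OR-ed into mm_res before the combined result
                   is stored. */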

    __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;

    alpha &= ~(1 + 2 + 4);
    mm_alpha = _mm_set_pi32(0, alpha);

    mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);
    mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);

    mm_alpha = _mm_slli_si64(mm_alpha, 3);

    rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);
    gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);
    bmask = _mm_set_pi32(0x001F001F, 0x001F001F);

            s = (s | s << 16) & 0x03e07c1f;
            d = (d | d << 16) & 0x03e07c1f;

            s = (s | s << 16) & 0x03e07c1f;
            d = (d | d << 16) & 0x03e07c1f;

            s = (s | s << 16) & 0x03e07c1f;
            d = (d | d << 16) & 0x03e07c1f;

                src1 = *(__m64 *) srcp;
                dst1 = *(__m64 *) dstp;

                src2 = _mm_and_si64(src2, rmask);
                dst2 = _mm_and_si64(dst2, rmask);

                src2 = _mm_sub_pi16(src2, dst2);
                src2 = _mm_mulhi_pi16(src2, mm_alpha);
                src2 = _mm_slli_pi16(src2, 5);
                dst2 = _mm_add_pi16(src2, dst2);
                dst2 = _mm_and_si64(dst2, rmask);

                src2 = _mm_and_si64(src2, gmask);
                dst2 = _mm_and_si64(dst2, gmask);

                src2 = _mm_sub_pi16(src2, dst2);
                src2 = _mm_mulhi_pi16(src2, mm_alpha);
                src2 = _mm_slli_pi16(src2, 5);
                dst2 = _mm_add_pi16(src2, dst2);

                mm_res = _mm_or_si64(mm_res, dst2);

                src2 = _mm_and_si64(src2, bmask);
                dst2 = _mm_and_si64(dst2, bmask);

                src2 = _mm_sub_pi16(src2, dst2);
                src2 = _mm_mullo_pi16(src2, mm_alpha);
                src2 = _mm_srli_pi16(src2, 11);
                dst2 = _mm_add_pi16(src2, dst2);
                dst2 = _mm_and_si64(dst2, bmask);

                mm_res = _mm_or_si64(mm_res, dst2);

                *(__m64 *) dstp = mm_res;
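                /* The 555 path mirrors the 565 one above with 5-bit masks
                   for all three channels (0x7C00 / 0x03E0 / 0x001F); red
                   now needs an explicit mask because it no longer occupies
                   the topmost bits of the 16-bit pixel. */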

            s = (s | s << 16) & 0x07e0f81f;
            d = (d | d << 16) & 0x07e0f81f;

    unsigned alpha = info->a;

            s = (s | s << 16) & 0x03e07c1f;
            d = (d | d << 16) & 0x03e07c1f;

            unsigned alpha = s >> 27;

                *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f));
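                /* Per-pixel ARGB8888 -> RGB565: alpha = s >> 27 keeps only
                   the top five bits of the 8-bit source alpha, matching the
                   5-bit blend of the 16-bit paths, and the fully opaque
                   case above simply repacks the top 5/6/5 bits of R, G and
                   B into a 565 pixel. */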

                s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
                    + (s >> 3 & 0x1f);
                d = (d | d << 16) & 0x07e0f81f;
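                /* For the translucent case the ARGB8888 source is converted
                   directly into the gap-packed 565 layout: green's top six
                   bits go to the high half, red's and blue's top five bits
                   to 0xf800 and 0x001f, so the same single-multiply blend
                   can be applied against the expanded destination. */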

                *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f));

                s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
                    + (s >> 3 & 0x1f);
                d = (d | d << 16) & 0x03e07c1f;

    unsigned sR, sG, sB;
    unsigned dR, dG, dB, dA;
    const unsigned sA = info->a;

    unsigned sR, sG, sB;
    unsigned dR, dG, dB, dA;
    const unsigned sA = info->a;

            if (sA && Pixel != ckey) {

    unsigned sR, sG, sB, sA;
    unsigned dR, dG, dB, dA;

            && sf->Gmask == 0xff00
            && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
                || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
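            /* This branch selects the ARGB8888 -> 16-bpp per-pixel alpha
               converters; the destination Gmask tells them apart, 0x7e0
               meaning RGB565 and 0x3e0 meaning RGB555. */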
            if (df->Gmask == 0x7e0)

            else if (df->Gmask == 0x3e0)

#if defined(__MMX__) || defined(__3dNOW__)
                    return BlitRGBtoRGBPixelAlphaMMX3DNOW;

                    return BlitRGBtoRGBPixelAlphaMMX;

        if (sf->Amask == 0xff000000) {

        if (sf->Amask == 0) {

            if (df->Gmask == 0x7e0) {
                    return Blit565to565SurfaceAlphaMMX;
            } else if (df->Gmask == 0x3e0) {
                    return Blit555to555SurfaceAlphaMMX;

            return BlitRGBtoRGBSurfaceAlphaMMX;

    if (sf->Amask == 0) {