Added SSE and MMX optimization for SDL_FillRect()

--HG-- extra : convert_revision : svn%3Ac70aab31-4412-0410-b14c-859654838e24/trunk%402611

Added SSE and MMX optimization for SDL_FillRect()
--HG-- extra : convert_revision : svn%3Ac70aab31-4412-0410-b14c-859654838e24/trunk%402611
5d8720fd · Sam Lantinga · d7134d38 · 5d8720fd · 5d8720fd · 5d8720fd
Commit 5d8720fd authored Aug 16, 2007 by Sam Lantinga
Showing with 295 additions and 107 deletions

SDL_blit.c src/video/SDL_blit.c +6 -3

SDL_blit.h src/video/SDL_blit.h +15 -0

SDL_blit_copy.c src/video/SDL_blit_copy.c +1 -6

SDL_surface.c src/video/SDL_surface.c +273 -98

No files found.
--- a/src/video/SDL_blit.c
+++ b/src/video/SDL_blit.c
@@ -110,7 +110,8 @@ SDL_SoftBlit(SDL_Surface * src, SDL_Rect * srcrect,
 #ifdef __MACOSX__
 #include <sys/sysctl.h>
-static SDL_bool SDL_UseAltivecPrefetch()
+static SDL_bool
+SDL_UseAltivecPrefetch()
 {
    const char key[] = "hw.l3cachesize";
    u_int64_t result = 0;
@@ -123,14 +124,16 @@ static SDL_bool SDL_UseAltivecPrefetch()
    }
 }
 #else
-static SDL_bool SDL_UseAltivecPrefetch()
+static SDL_bool
+SDL_UseAltivecPrefetch()
 {
    /* Just guess G4 */
    return SDL_TRUE;
 }
 #endif /* __MACOSX__ */
-static SDL_loblit SDL_ChooseBlitFunc(SDL_BlitEntry *entries, int count)
+static SDL_loblit
+SDL_ChooseBlitFunc(SDL_BlitEntry * entries, int count)
 {
    int i;
    static Uint32 features = 0xffffffff;

--- a/src/video/SDL_blit.h
+++ b/src/video/SDL_blit.h
@@ -24,6 +24,13 @@
 #ifndef _SDL_blit_h
 #define _SDL_blit_h
+#ifdef __MMX__
+#include <mmintrin.h>
+#endif
+#ifdef __SSE__
+#include <xmmintrin.h>
+#endif
 #include "SDL_endian.h"
 /* The structure passed to the low level blit functions */
@@ -92,6 +99,14 @@ extern SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface * surface, int complex);
 * Useful macros for blitting routines
 */
+#if defined(__GNUC__) /* GCC-style compilers: request alignment with the aligned attribute */
+#define DECLARE_ALIGNED(t,v,a)  t __attribute__((aligned(a))) v /* declares v with type t, aligned to a bytes */
+#elif defined(_MSC_VER) /* MSVC: request alignment with __declspec(align) */
+#define DECLARE_ALIGNED(t,v,a)  t __declspec(align(a)) v
+#else /* any other compiler: plain declaration, no alignment is requested */
+#define DECLARE_ALIGNED(t,v,a)  t v
+#endif
 #define FORMAT_EQUAL(A, B)						\
    ((A)->BitsPerPixel == (B)->BitsPerPixel				\
     && ((A)->Rmask == (B)->Rmask) && ((A)->Amask == (B)->Amask))

--- a/src/video/SDL_blit_copy.c
+++ b/src/video/SDL_blit_copy.c
@@ -23,13 +23,8 @@
 #include "SDL_video.h"
 #include "SDL_blit.h"
+#include "SDL_blit_copy.h"
-#ifdef __MMX__
-#include <mmintrin.h>
-#endif
-#ifdef __SSE__
-#include <xmmintrin.h>
-#endif
 #ifdef __MMX__
 static __inline__ void

--- a/src/video/SDL_surface.c
+++ b/src/video/SDL_surface.c
@@ -509,20 +509,220 @@ SDL_UpperBlit(SDL_Surface * src, SDL_Rect * srcrect,
    return 0;
 }
-static int
+#ifdef __SSE__
-SDL_FillRect1(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color)
+/* *INDENT-OFF* */
+#define SSE_BEGIN /* Builds c128: the 32-bit 'color' repeated in all four 32-bit lanes of an __m128. */ \
+    DECLARE_ALIGNED(Uint32, cccc[4], 16); /* 16-byte aligned staging array, read back below as an __m128 */ \
+    cccc[0] = color; \
+    cccc[1] = color; \
+    cccc[2] = color; \
+    cccc[3] = color; \
+    __m128 c128 = *(__m128 *)cccc;
+#define SSE_WORK /* Writes c128 over every whole 64-byte chunk of n; expects i, n, p and c128 in the enclosing scope. */ \
+    for (i = n / 64; i--;) { \
+        _mm_stream_ps((float *)(p+0), c128); /* four streaming 16-byte stores = 64 bytes; p is expected to be 16-byte aligned here */ \
+        _mm_stream_ps((float *)(p+16), c128); \
+        _mm_stream_ps((float *)(p+32), c128); \
+        _mm_stream_ps((float *)(p+48), c128); \
+        p += 64; /* n is not modified; the user of this macro handles the n & 63 leftover bytes */ \
+    }
+#define SSE_END /* expands to nothing */
+#define DEFINE_SSE_FILLRECT(bpp, type) /* Generates SDL_FillRect<bpp>SSE: fills h rows of w pixels, 'bpp' bytes per pixel, rows 'pitch' bytes apart. */ \
+static void \
+SDL_FillRect##bpp##SSE(Uint8 *pixels, int pitch, Uint32 color, int w, int h) \
+{ \
+    SSE_BEGIN; \
+ \
+    while (h--) { /* one iteration per row */ \
+        int i, n = w * bpp; /* n = bytes still to write in this row */ \
+        Uint8 *p = pixels; \
+ \
+        if (n > 15) { /* only rows of at least 16 bytes take the aligned path */ \
+            int adjust = 16 - ((uintptr_t)p & 15); /* bytes up to the next 16-byte boundary; 16 means already aligned */ \
+            if (adjust < 16) { \
+                n -= adjust; \
+                adjust /= bpp; /* convert bytes to pixels */ \
+                while(adjust--) { /* head: single-pixel stores; assumes p is a multiple of bpp so this ends on the boundary - TODO confirm */ \
+                    *((type *)p) = (type)color; \
+                    p += bpp; \
+                } \
+            } \
+            SSE_WORK; /* bulk: whole 64-byte chunks, advances p but not n */ \
+        } \
+        if (n & 63) { /* tail: bytes not covered by the 64-byte loop */ \
+            int remainder = (n & 63); \
+            remainder /= bpp; /* convert bytes to pixels */ \
+            while(remainder--) { \
+                *((type *)p) = (type)color; \
+                p += bpp; \
+            } \
+        } \
+        pixels += pitch; /* advance to the start of the next row */ \
+    } \
+ \
+    SSE_END; \
+}
+DEFINE_SSE_FILLRECT(1, Uint8) /* defines SDL_FillRect1SSE */
+DEFINE_SSE_FILLRECT(2, Uint16) /* defines SDL_FillRect2SSE */
+DEFINE_SSE_FILLRECT(4, Uint32) /* defines SDL_FillRect4SSE */
+/* *INDENT-ON* */
+#endif /* __SSE__ */
+#ifdef __MMX__
+/* *INDENT-OFF* */
+#define MMX_BEGIN /* Builds c64: two copies of the 32-bit 'color' packed into one __m64. */ \
+    __m64 c64 = _mm_set_pi32(color, color)
+#define MMX_WORK /* Writes c64 over every whole 64-byte chunk of n; expects i, n, p and c64 in the enclosing scope. */ \
+    for (i = n / 64; i--;) { \
+        _mm_stream_pi((__m64 *)(p+0), c64); /* eight streaming 8-byte stores = 64 bytes per iteration */ \
+        _mm_stream_pi((__m64 *)(p+8), c64); \
+        _mm_stream_pi((__m64 *)(p+16), c64); \
+        _mm_stream_pi((__m64 *)(p+24), c64); \
+        _mm_stream_pi((__m64 *)(p+32), c64); \
+        _mm_stream_pi((__m64 *)(p+40), c64); \
+        _mm_stream_pi((__m64 *)(p+48), c64); \
+        _mm_stream_pi((__m64 *)(p+56), c64); \
+        p += 64; /* n is not modified; the user of this macro handles the n & 63 leftover bytes */ \
+    }
+#define MMX_END /* leave MMX state (EMMS) once all rows are written */ \
+    _mm_empty()
+#define DEFINE_MMX_FILLRECT(bpp, type) /* Generates SDL_FillRect<bpp>MMX: fills h rows of w pixels, 'bpp' bytes per pixel, rows 'pitch' bytes apart. */ \
+static void \
+SDL_FillRect##bpp##MMX(Uint8 *pixels, int pitch, Uint32 color, int w, int h) \
+{ \
+    MMX_BEGIN; \
+ \
+    while (h--) { /* one iteration per row */ \
+        int i, n = w * bpp; /* n = bytes still to write in this row */ \
+        Uint8 *p = pixels; \
+ \
+        if (n > 7) { /* only rows of at least 8 bytes take the aligned path */ \
+            int adjust = 8 - ((uintptr_t)p & 7); /* bytes up to the next 8-byte boundary; 8 means already aligned */ \
+            if (adjust < 8) { \
+                n -= adjust; \
+                adjust /= bpp; /* convert bytes to pixels */ \
+                while(adjust--) { /* head: single-pixel stores; assumes p is a multiple of bpp so this ends on the boundary - TODO confirm */ \
+                    *((type *)p) = (type)color; \
+                    p += bpp; \
+                } \
+            } \
+            MMX_WORK; /* bulk: whole 64-byte chunks, advances p but not n */ \
+        } \
+        if (n & 63) { /* tail: bytes not covered by the 64-byte loop */ \
+            int remainder = (n & 63); \
+            remainder /= bpp; /* convert bytes to pixels */ \
+            while(remainder--) { \
+                *((type *)p) = (type)color; \
+                p += bpp; \
+            } \
+        } \
+        pixels += pitch; /* advance to the start of the next row */ \
+    } \
+ \
+    MMX_END; \
+}
+DEFINE_MMX_FILLRECT(1, Uint8) /* defines SDL_FillRect1MMX */
+DEFINE_MMX_FILLRECT(2, Uint16) /* defines SDL_FillRect2MMX */
+DEFINE_MMX_FILLRECT(4, Uint32) /* defines SDL_FillRect4MMX */
+/* *INDENT-ON* */
+#endif /* __MMX__ */
+static void /* Portable fill for 1-byte pixels: h rows of w bytes starting at 'pixels', rows 'pitch' bytes apart. */
+SDL_FillRect1(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
 {
-    /* FIXME: We have to worry about packing order.. *sigh* */
+    while (h--) { /* one iteration per row */
-    SDL_SetError("1-bpp rect fill not yet implemented");
+        int n = w; /* bytes still to write in this row */
-    return -1;
+        Uint8 *p = pixels;
+        if (n > 3) { /* long enough for at least one 32-bit store */
+            switch ((uintptr_t) p & 3) { /* head: byte stores up to the next 4-byte boundary */
+            case 1: /* three bytes short of the boundary; falls through */
+                *p++ = (Uint8) color;
+                --n;
+            case 2: /* two bytes short; falls through */
+                *p++ = (Uint8) color;
+                --n;
+            case 3: /* one byte short */
+                *p++ = (Uint8) color;
+                --n;
+            }
+            SDL_memset4(p, color, (n >> 2)); /* presumably stores n >> 2 copies of the 32-bit color; the visible caller replicates the byte into all four bytes first */
+        }
+        if (n & 3) { /* tail: 1 to 3 bytes left after the 32-bit stores */
+            p += (n & ~3); /* skip the part already covered above (0 when n <= 3) */
+            switch (n & 3) { /* each case falls through, so exactly n & 3 bytes are written */
+            case 3:
+                *p++ = (Uint8) color;
+            case 2:
+                *p++ = (Uint8) color;
+            case 1:
+                *p++ = (Uint8) color;
+            }
+        }
+        pixels += pitch; /* advance to the start of the next row */
+    }
 }
-static int
+static void /* Portable fill for 2-byte pixels: h rows of w pixels starting at 'pixels', rows 'pitch' bytes apart. */
-SDL_FillRect4(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color)
+SDL_FillRect2(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
+{
+    while (h--) { /* one iteration per row */
+        int n = w; /* pixels still to write in this row */
+        Uint16 *p = (Uint16 *) pixels;
+        if (n > 1) { /* long enough for at least one 32-bit store */
+            if ((uintptr_t) p & 2) { /* head: one pixel to reach a 4-byte boundary */
+                *p++ = (Uint16) color;
+                --n;
+            }
+            SDL_memset4(p, color, (n >> 1)); /* presumably stores n >> 1 copies of the 32-bit color, i.e. two pixels each; the visible caller duplicates the 16-bit value into both halves first */
+        }
+        if (n & 1) { /* odd pixel left over: it is the last one in the row */
+            p[n - 1] = (Uint16) color;
+        }
+        pixels += pitch; /* advance to the start of the next row */
+    }
+}
+static void /* Fill for 3-byte pixels: h rows of w pixels starting at 'pixels', rows 'pitch' bytes apart; three byte stores per pixel. */
+SDL_FillRect3(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
+{
+    Uint8 r = (Uint8) (color & 0xFF); /* bits 0-7 of color, written first; r/g/b name byte positions only, the channel each holds depends on the surface format */
+    Uint8 g = (Uint8) ((color >> 8) & 0xFF); /* bits 8-15, written second */
+    Uint8 b = (Uint8) ((color >> 16) & 0xFF); /* bits 16-23, written third. NOTE(review): low byte goes first on every host, while the removed 3-byte path shifted color on SDL_BIG_ENDIAN - confirm big-endian output */
+    while (h--) { /* one iteration per row */
+        int n = w; /* pixels still to write in this row */
+        Uint8 *p = pixels;
+        while (n--) {
+            *p++ = r;
+            *p++ = g;
+            *p++ = b;
+        }
+        pixels += pitch; /* advance to the start of the next row */
+    }
+}
+static void /* Portable fill for 4-byte pixels: h rows of w pixels starting at 'pixels', rows 'pitch' bytes apart. */
+SDL_FillRect4(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
 {
-    /* FIXME: We have to worry about packing order.. *sigh* */
+    while (h--) { /* one iteration per row */
-    SDL_SetError("4-bpp rect fill not yet implemented");
+        SDL_memset4(pixels, color, w); /* presumably stores w copies of the 32-bit color, one per pixel */
-    return -1;
+        pixels += pitch; /* advance to the start of the next row */
+    }
 }
 /* 
@@ -531,23 +731,12 @@ SDL_FillRect4(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color)
 int
 SDL_FillRect(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color)
 {
-    int x, y;
+    Uint8 *pixels;
-    Uint8 *row;
    /* This function doesn't work on surfaces < 8 bpp */
    if (dst->format->BitsPerPixel < 8) {
-        switch (dst->format->BitsPerPixel) {
+        SDL_SetError("Fill rect on unsupported surface format");
-        case 1:
+        return (-1);
-            return SDL_FillRect1(dst, dstrect, color);
-            break;
-        case 4:
-            return SDL_FillRect4(dst, dstrect, color);
-            break;
-        default:
-            SDL_SetError("Fill rect on unsupported surface format");
-            return (-1);
-            break;
-        }
    }
    /* If 'dstrect' == NULL, then fill the whole surface */
@@ -564,97 +753,83 @@ SDL_FillRect(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color)
    if (SDL_LockSurface(dst) != 0) {
        return (-1);
    }
-    row = (Uint8 *) dst->pixels + dstrect->y * dst->pitch +
+    pixels =
+        (Uint8 *) dst->pixels + dstrect->y * dst->pitch +
        dstrect->x * dst->format->BytesPerPixel;
-    if (dst->format->palette || (color == 0)) {
-        x = dstrect->w * dst->format->BytesPerPixel;
+    switch (dst->format->BytesPerPixel) {
-#ifndef __MACOSX__              /* memset() is optimized on Mac OS X */
+    case 1:
-        if (!color && !((uintptr_t) row & 3) && !(x & 3)
-            && !(dst->pitch & 3)) {
-            int n = x >> 2;
-            for (y = dstrect->h; y; --y) {
-                SDL_memset4(row, 0, n);
-                row += dst->pitch;
-            }
-        } else
-#endif /* !__MACOSX__ */
        {
-            for (y = dstrect->h; y; y--) {
+            color |= (color << 8);
-                SDL_memset(row, color, x);
+            color |= (color << 16);
-                row += dst->pitch;
+#ifdef __SSE__
+            if (SDL_HasSSE()) {
+                SDL_FillRect1SSE(pixels, dst->pitch, color, dstrect->w,
+                                 dstrect->h);
+                break;
            }
-        }
+#endif
-    } else {
+#ifdef __MMX__
-        switch (dst->format->BytesPerPixel) {
+            if (SDL_HasMMX()) {
-        case 2:
+                SDL_FillRect1MMX(pixels, dst->pitch, color, dstrect->w,
-            {
+                                 dstrect->h);
-                Uint16 c = (Uint16) color;
+                break;
-                Uint32 cc = (Uint32) c << 16 | c;
-                for (y = dstrect->h; y; --y) {
-                    Uint16 *pixels = (Uint16 *) row;
-                    int n = dstrect->w;
-                    if ((uintptr_t) pixels & 3) {
-                        *pixels++ = c;
-                        n--;
-                    }
-                    if (n >> 1)
-                        SDL_memset4(pixels, cc, n >> 1);
-                    if (n & 1)
-                        pixels[n - 1] = c;
-                    row += dst->pitch;
-                }
            }
+#endif
+            SDL_FillRect1(pixels, dst->pitch, color, dstrect->w, dstrect->h);
            break;
+        }
-        case 3:
+    case 2:
-#if SDL_BYTEORDER == SDL_BIG_ENDIAN
+        {
-            color <<= 8;
+            color |= (color << 16);
+#ifdef __SSE__
+            if (SDL_HasSSE()) {
+                SDL_FillRect2SSE(pixels, dst->pitch, color, dstrect->w,
+                                 dstrect->h);
+                break;
+            }
 #endif
-            for (y = dstrect->h; y; --y) {
+#ifdef __MMX__
-                Uint8 *pixels = row;
+            if (SDL_HasMMX()) {
-                for (x = dstrect->w; x; --x) {
+                SDL_FillRect2MMX(pixels, dst->pitch, color, dstrect->w,
-                    SDL_memcpy(pixels, &color, 3);
+                                 dstrect->h);
-                    pixels += 3;
+                break;
-                }
-                row += dst->pitch;
            }
+#endif
+            SDL_FillRect2(pixels, dst->pitch, color, dstrect->w, dstrect->h);
+            break;
+        }
+    case 3:
+        /* 24-bit RGB is a slow path, at least for now. */
+        {
+            SDL_FillRect3(pixels, dst->pitch, color, dstrect->w, dstrect->h);
            break;
+        }
-        case 4:
+    case 4:
-#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && SDL_ASSEMBLY_ROUTINES
+        {
-            if (SDL_HasSSE() && !((uintptr_t) row & 15) && !(dstrect->w & 3)) {
+#ifdef __SSE__
-                Uint32 cccc[4] __attribute__ ((aligned(16))) = {
+            if (SDL_HasSSE()) {
-                color, color, color, color};
+                SDL_FillRect4SSE(pixels, dst->pitch, color, dstrect->w,
-                int i, n = dstrect->w / 4;
+                                 dstrect->h);
-                __asm__ __volatile__("	movdqa (%0), %%xmm0\n"::
-                                     "r"(cccc):"memory");
-                for (y = dstrect->h; y; --y) {
-                    Uint8 *pixels = row;
-                    for (i = n / 2; i--;) {
-                        /* *INDENT-OFF* */
-                        __asm__ __volatile__("	prefetchnta 256(%0)\n"
-                                             "	movdqa %%xmm0, (%0)\n"
-                                             "	movdqa %%xmm0, 16(%0)\n"::"r"(pixels):"memory");
-                        /* *INDENT-ON* */
-                        pixels += 32;
-                    }
-                    if (n & 1) {
-                        __asm__ __volatile__("	movdqa %%xmm0, (%0)\n"::
-                                             "r"(pixels):"memory");
-                    }
-                    row += dst->pitch;
-                }
-                __asm__ __volatile__("	emms\n"::);
                break;
            }
 #endif
-            for (y = dstrect->h; y; --y) {
+#ifdef __MMX__
-                SDL_memset4(row, color, dstrect->w);
+            if (SDL_HasMMX()) {
-                row += dst->pitch;
+                SDL_FillRect4MMX(pixels, dst->pitch, color, dstrect->w,
+                                 dstrect->h);
+                break;
            }
+#endif
+            SDL_FillRect4(pixels, dst->pitch, color, dstrect->w, dstrect->h);
            break;
        }
    }
    SDL_UnlockSurface(dst);
    /* We're done! */