Removed hermes since it's LGPL and not compatible with a commercial license.

Prepping for using MMX and SSE intrinsics instead of inline assembly ... except for memcpy equivalents, which only get faster if they can exploit the parallelism of loading into multiple SIMD registers. :) --HG-- extra : convert_revision : svn%3Ac70aab31-4412-0410-b14c-859654838e24/trunk%402609

Removed hermes since it's LGPL and not compatible with a commercial license.
Prepping for using MMX and SSE intrinsics instead of inline assembly ... except for memcpy equivalents, which only get faster if they can exploit the parallelism of loading into multiple SIMD registers. :) --HG-- extra : convert_revision : svn%3Ac70aab31-4412-0410-b14c-859654838e24/trunk%402609
37fe3a93 · Sam Lantinga · 92c5ea48 · 37fe3a93 · 37fe3a93 · 37fe3a93
Commit 37fe3a93 authored Aug 15, 2007 by Sam Lantinga
18 changed files
--- a/build-scripts/makedep.sh
+++ b/build-scripts/makedep.sh
@@ -65,12 +65,6 @@ __EOF__

 	\$(LIBTOOL) --mode=compile \$(CC) \$(CFLAGS) \$(EXTRA_CFLAGS) -c $src  -o \$@

-__EOF__
-        ;;
-        asm) cat >>${output}.new <<__EOF__
-
-	\$(LIBTOOL) --tag=CC --mode=compile \$(auxdir)/strip_fPIC.sh \$(NASM) $src -o \$@
-
 __EOF__
        ;;
        S) cat >>${output}.new <<__EOF__

--- a/configure.in
+++ b/configure.in
--- a/include/SDL_config.h.in
+++ b/include/SDL_config.h.in
@@ -292,7 +292,6 @@

 /* Enable assembly routines */
 #undef SDL_ASSEMBLY_ROUTINES
-#undef SDL_HERMES_BLITTERS
 #undef SDL_ALTIVEC_BLITTERS

 #endif /* _SDL_config_h */
--- a/src/hermes/COPYING.LIB
+++ b/src/hermes/COPYING.LIB
--- a/src/hermes/HeadMMX.h
+++ b/src/hermes/HeadMMX.h
-/*
-   Header definitions for the MMX routines for the HERMES library
-   Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
-   This source code is licensed under the GNU LGPL
-  
-   Please refer to the file COPYING.LIB contained in the distribution for
-   licensing conditions
-*/
-#include "SDL_config.h"
-
-#ifndef __HERMES_HEAD_MMX__
-#define __HERMES_HEAD_MMX__
-
-
-/* If you cannot stand ifdefs, then please do not look into this file, it's
-   going to end your life :) */
-
-#ifdef X86_ASSEMBLER
-
-
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-
-    void STACKCALL ConvertMMX(HermesConverterInterface *);
-
-    void STACKCALL ClearMMX_32(HermesClearInterface *);
-    void STACKCALL ClearMMX_24(HermesClearInterface *);
-    void STACKCALL ClearMMX_16(HermesClearInterface *);
-    void STACKCALL ClearMMX_8(HermesClearInterface *);
-
-    void ConvertMMXpII32_24RGB888();
-    void ConvertMMXpII32_16RGB565();
-    void ConvertMMXpII32_16BGR565();
-    void ConvertMMXpII32_16RGB555();
-    void ConvertMMXpII32_16BGR565();
-    void ConvertMMXpII32_16BGR555();
-
-    void ConvertMMXp32_16RGB555();
-
-#ifdef __cplusplus
-}
-#endif
-
-
-
-/* Fix the underscore business with ELF compilers */
-
-#if defined(__ELF__) && defined(__GNUC__)
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-
-    extern void _ConvertMMX(HermesConverterInterface *);
-    extern void _ConvertMMXpII32_24RGB888();
-    extern void _ConvertMMXpII32_16RGB565();
-    extern void _ConvertMMXpII32_16BGR565();
-    extern void _ConvertMMXpII32_16RGB555();
-    extern void _ConvertMMXpII32_16BGR555();
-
-#define ConvertMMX _ConvertMMX
-#define ConvertMMXpII32_24RGB888 _ConvertMMXpII32_24RGB888
-#define ConvertMMXpII32_16RGB565 _ConvertMMXpII32_16RGB565
-#define ConvertMMXpII32_16BGR565 _ConvertMMXpII32_16BGR565
-#define ConvertMMXpII32_16RGB555 _ConvertMMXpII32_16RGB555
-#define ConvertMMXpII32_16BGR555 _ConvertMMXpII32_16BGR555
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif                          /* ELF and GNUC */
-
-
-
-
-/* Make it work with Watcom */
-#ifdef __WATCOMC__
-#pragma warning 601 9
-
-#pragma aux ConvertMMX "_*" modify [EAX EBX ECX EDX ESI EDI]
-
-#pragma aux ClearMMX_32 "_*" modify [EAX EBX ECX EDX ESI EDI]
-#pragma aux ClearMMX_24 "_*" modify [EAX EBX ECX EDX ESI EDI]
-#pragma aux ClearMMX_16 "_*" modify [EAX EBX ECX EDX ESI EDI]
-#pragma aux ClearMMX_8 "_*" modify [EAX EBX ECX EDX ESI EDI]
-
-#pragma aux ConvertMMXpII32_24RGB888 "_*"
-#pragma aux ConvertMMXpII32_16RGB565 "_*"
-#pragma aux ConvertMMXpII32_16BGR565 "_*"
-#pragma aux ConvertMMXpII32_16RGB555 "_*"
-#pragma aux ConvertMMXpII32_16BGR555 "_*"
-#pragma aux ConvertMMXp32_16RGB555 "_*"
-
-#endif                          /* WATCOM */
-
-#endif                          /* X86_ASSEMBLER */
-
-
-#endif
-/* vi: set ts=4 sw=4 expandtab: */
--- a/src/hermes/HeadX86.h
+++ b/src/hermes/HeadX86.h
-/*
-   Header definitions for the x86 routines for the HERMES library
-   Copyright (c) 1998 Christian Nentwich (brn@eleet.mcb.at)
-   This source code is licensed under the GNU LGPL
-  
-   Please refer to the file COPYING.LIB contained in the distribution for
-   licensing conditions
-*/
-
-#ifndef __HERMES_HEAD_X86__
-#define __HERMES_HEAD_X86__
-
-
-#ifdef X86_ASSEMBLER
-
-/* If you can't stand IFDEFS, then close your eyes now, please :) */
-
-/* Ok, we start with normal function definitions */
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-
-
-    void STACKCALL ConvertX86(HermesConverterInterface *);
-    void STACKCALL ClearX86_32(HermesClearInterface *);
-    void STACKCALL ClearX86_24(HermesClearInterface *);
-    void STACKCALL ClearX86_16(HermesClearInterface *);
-    void STACKCALL ClearX86_8(HermesClearInterface *);
-
-    int STACKCALL Hermes_X86_CPU();
-
-    void ConvertX86p32_32BGR888();
-    void ConvertX86p32_32RGBA888();
-    void ConvertX86p32_32BGRA888();
-    void ConvertX86p32_24RGB888();
-    void ConvertX86p32_24BGR888();
-    void ConvertX86p32_16RGB565();
-    void ConvertX86p32_16BGR565();
-    void ConvertX86p32_16RGB555();
-    void ConvertX86p32_16BGR555();
-    void ConvertX86p32_8RGB332();
-
-    void ConvertX86p16_32RGB888();
-    void ConvertX86p16_32BGR888();
-    void ConvertX86p16_32RGBA888();
-    void ConvertX86p16_32BGRA888();
-    void ConvertX86p16_24RGB888();
-    void ConvertX86p16_24BGR888();
-    void ConvertX86p16_16BGR565();
-    void ConvertX86p16_16RGB555();
-    void ConvertX86p16_16BGR555();
-    void ConvertX86p16_8RGB332();
-
-    void CopyX86p_4byte();
-    void CopyX86p_3byte();
-    void CopyX86p_2byte();
-    void CopyX86p_1byte();
-
-    void ConvertX86pI8_32();
-    void ConvertX86pI8_24();
-    void ConvertX86pI8_16();
-
-    extern int ConvertX86p16_32RGB888_LUT_X86[512];
-    extern int ConvertX86p16_32BGR888_LUT_X86[512];
-    extern int ConvertX86p16_32RGBA888_LUT_X86[512];
-    extern int ConvertX86p16_32BGRA888_LUT_X86[512];
-
-#ifdef __cplusplus
-}
-#endif
-
-
-
-
-/* Now fix up the ELF underscore problem */
-
-#if defined(__ELF__) && defined(__GNUC__)
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-
-    extern int _Hermes_X86_CPU();
-
-    extern void _ConvertX86(HermesConverterInterface *);
-
-    extern void _ConvertX86p32_32BGR888();
-    extern void _ConvertX86p32_32RGBA888();
-    extern void _ConvertX86p32_32BGRA888();
-    extern void _ConvertX86p32_24RGB888();
-    extern void _ConvertX86p32_24BGR888();
-    extern void _ConvertX86p32_16RGB565();
-    extern void _ConvertX86p32_16BGR565();
-    extern void _ConvertX86p32_16RGB555();
-    extern void _ConvertX86p32_16BGR555();
-    extern void _ConvertX86p32_8RGB332();
-
-    extern void _ConvertX86p16_16BGR565();
-    extern void _ConvertX86p16_16RGB555();
-    extern void _ConvertX86p16_16BGR555();
-    extern void _ConvertX86p16_8RGB332();
-
-
-#define Hermes_X86_CPU _Hermes_X86_CPU
-
-#define ConvertX86 _ConvertX86
-
-#define ConvertX86p32_32BGR888 _ConvertX86p32_32BGR888
-#define ConvertX86p32_32RGBA888 _ConvertX86p32_32RGBA888
-#define ConvertX86p32_32BGRA888 _ConvertX86p32_32BGRA888
-#define ConvertX86p32_24RGB888 _ConvertX86p32_24RGB888
-#define ConvertX86p32_24BGR888 _ConvertX86p32_24BGR888
-#define ConvertX86p32_16RGB565 _ConvertX86p32_16RGB565
-#define ConvertX86p32_16BGR565 _ConvertX86p32_16BGR565
-#define ConvertX86p32_16RGB555 _ConvertX86p32_16RGB555
-#define ConvertX86p32_16BGR555 _ConvertX86p32_16BGR555
-#define ConvertX86p32_8RGB332 _ConvertX86p32_8RGB332
-
-#define ConvertX86p16_16BGR565 _ConvertX86p16_16BGR565
-#define ConvertX86p16_16RGB555 _ConvertX86p16_16RGB555
-#define ConvertX86p16_16BGR555 _ConvertX86p16_16BGR555
-#define ConvertX86p16_8RGB332 _ConvertX86p16_8RGB332
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif                          /* ELF & GNU */
-
-
-
-/* Make it run with WATCOM C */
-#ifdef __WATCOMC__
-#pragma warning 601 9
-
-#pragma aux Hermes_X86_CPU "_*"
-
-#pragma aux ConvertX86 "_*" modify [EAX EBX ECX EDX ESI EDI]
-#pragma aux ClearX86_32 "_*" modify [EAX EBX ECX EDX ESI EDI]
-#pragma aux ClearX86_24 "_*" modify [EAX EBX ECX EDX ESI EDI]
-#pragma aux ClearX86_16 "_*" modify [EAX EBX ECX EDX ESI EDI]
-#pragma aux ClearX86_8 "_*" modify [EAX EBX ECX EDX ESI EDI]
-
-#pragma aux ConvertX86p32_32BGR888 "_*"
-#pragma aux ConvertX86p32_32RGBA888 "_*"
-#pragma aux ConvertX86p32_32BGRA888 "_*"
-#pragma aux ConvertX86p32_24RGB888 "_*"
-#pragma aux ConvertX86p32_24BGR888 "_*"
-#pragma aux ConvertX86p32_16RGB565 "_*"
-#pragma aux ConvertX86p32_16BGR565 "_*"
-#pragma aux ConvertX86p32_16RGB555 "_*"
-#pragma aux ConvertX86p32_16BGR555 "_*"
-#pragma aux ConvertX86p32_8RGB332 "_*"
-
-#pragma aux ConvertX86p16_32RGB888 "_*"
-#pragma aux ConvertX86p16_32BGR888 "_*"
-#pragma aux ConvertX86p16_32RGBA888 "_*"
-#pragma aux ConvertX86p16_32BGRA888 "_*"
-#pragma aux ConvertX86p16_24RGB888 "_*"
-#pragma aux ConvertX86p16_24BGR888 "_*"
-#pragma aux ConvertX86p16_16BGR565 "_*"
-#pragma aux ConvertX86p16_16RGB555 "_*"
-#pragma aux ConvertX86p16_16BGR555 "_*"
-#pragma aux ConvertX86p16_8RGB332 "_*"
-
-#pragma aux CopyX86p_4byte "_*"
-#pragma aux CopyX86p_3byte "_*"
-#pragma aux CopyX86p_2byte "_*"
-#pragma aux CopyX86p_1byte "_*"
-
-#pragma aux ConvertX86pI8_32 "_*"
-#pragma aux ConvertX86pI8_24 "_*"
-#pragma aux ConvertX86pI8_16 "_*"
-
-#pragma aux ConvertX86p16_32RGB888_LUT_X86 "_*"
-#pragma aux ConvertX86p16_32BGR888_LUT_X86 "_*"
-#pragma aux ConvertX86p16_32RGBA888_LUT_X86 "_*"
-#pragma aux ConvertX86p16_32BGRA888_LUT_X86 "_*"
-
-#endif                          /* __WATCOMC__ */
-
-
-#endif                          /* X86_ASSEMBLER */
-
-
-#endif
-
-/* vi: set ts=4 sw=4 expandtab: */
--- a/src/hermes/README
+++ b/src/hermes/README
-HERMES 1.2.4 (c)1998 Christian Nentwich (brn) (c.nentwich@cs.ucl.ac.uk)
-and quite a few assembler routines (c) Glenn Fielder (gaffer@gaffer.org)
-
-This library and all the files enclosed in this package are free software
-under the terms of the GNU Library General Public License (LGPL). Please
-refer to the included file COPYING.LIB for the exact terms.
----------------------------------------------------------------------------
-
-This is a stripped down version of HERMES, including only the x86 assembler
-converters, for use with Simple DirectMedia Layer.
-
-The full HERMES library is available at:  http://hermes.terminal.at/
-
--- a/src/hermes/common.inc
+++ b/src/hermes/common.inc
-; Some common macros for hermes nasm code
-
-%macro SDL_FUNC 1
-%ifdef HIDDEN_VISIBILITY
-GLOBAL %1:function hidden
-%else
-GLOBAL %1
-%endif
-%endmacro
--- a/src/hermes/mmx_main.asm
+++ b/src/hermes/mmx_main.asm
-;
-; mmx format converter main loops for HERMES
-; Some routines Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
-; This source code is licensed under the GNU LGPL
-; 
-; Please refer to the file COPYING.LIB contained in the distribution for
-; licensing conditions		
-;
-
-BITS 32
-
-%include "common.inc"
-
-SDL_FUNC _ConvertMMX
-
-SECTION .text
-		
-;; _ConvertMMX:	 
-;; [ESP+8] ConverterInfo*
-;; --------------------------------------------------------------------------
-;; ConverterInfo (ebp+..)
-;;   0:	void *s_pixels
-;;   4:	int s_width
-;;   8:	int s_height
-;;  12:	int s_add
-;;  16:	void *d_pixels
-;;  20:	int d_width
-;;  24:	int d_height
-;;  28:	int d_add
-;;  32:	void (*converter_function)() 
-;;  36: int32 *lookup
-	
-_ConvertMMX:
-	push ebp
-	mov ebp,esp
-
-; Save the registers used by the blitters, necessary for optimized code
-	pusha
-
-	mov eax,[ebp+8]
-
-        cmp dword [eax+4],BYTE 0
-	je endconvert
-	
-	mov ebp,eax
-	
-	mov esi,[ebp+0]
-	mov edi,[ebp+16]
-	
-y_loop:	
-	mov ecx,[ebp+4]
-
-	call [ebp+32]
-
-	add esi,[ebp+12]
-	add edi,[ebp+28]
-	
-	dec dword  [ebp+8]
-	jnz y_loop
-
-	
-; Restore the registers used by the blitters, necessary for optimized code
-	popa
-
-	pop ebp
-
-endconvert:
-	emms
-	
-	ret		
-
-%ifidn __OUTPUT_FORMAT__,elf
-section .note.GNU-stack noalloc noexec nowrite progbits
-%endif
--- a/src/hermes/mmxp2_32.asm
+++ b/src/hermes/mmxp2_32.asm
-;
-; pII-optimised MMX format converters for HERMES
-; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
-;   and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
-; This source code is licensed under the GNU LGPL
-; 
-; Please refer to the file COPYING.LIB contained in the distribution for
-; licensing conditions		
-;
-; COPYRIGHT NOTICE
-; 
-; This file partly contains code that is (c) Intel Corporation, specifically
-; the mode detection routine, and the converter to 15 bit (8 pixel
-; conversion routine from the mmx programming tutorial pages).
-;
-;
-; These routines aren't exactly pII optimised - it's just that as they
-; are, they're terrible on p5 MMXs, but less so on pIIs.  Someone needs to
-; optimise them for p5 MMXs..
-
-BITS 32
-
-%include "common.inc"
-	
-SDL_FUNC _ConvertMMXpII32_24RGB888
-SDL_FUNC _ConvertMMXpII32_16RGB565
-SDL_FUNC _ConvertMMXpII32_16BGR565
-SDL_FUNC _ConvertMMXpII32_16RGB555
-SDL_FUNC _ConvertMMXpII32_16BGR555
-
-;; Macros for conversion routines
-
-%macro _push_immq_mask 1
-	push dword %1
-	push dword %1
-%endmacro
-
-%macro load_immq 2
-	_push_immq_mask %2
-	movq %1, [esp]
-%endmacro
-
-%macro pand_immq 2
-	_push_immq_mask %2
-	pand %1, [esp]
-%endmacro
-
-%define CLEANUP_IMMQ_LOADS(num) \
-	add esp, byte 8 * num
-
-%define mmx32_rgb888_mask 00ffffffh
-%define mmx32_rgb565_b 000000f8h
-%define mmx32_rgb565_g 0000fc00h
-%define mmx32_rgb565_r 00f80000h
-
-%define mmx32_rgb555_rb 00f800f8h
-%define mmx32_rgb555_g 0000f800h
-%define mmx32_rgb555_mul 20000008h
-%define mmx32_bgr555_mul 00082000h
-
-SECTION .text
-
-_ConvertMMXpII32_24RGB888:
-
-        ; set up mm6 as the mask, mm7 as zero
-        load_immq mm6, mmx32_rgb888_mask
-        CLEANUP_IMMQ_LOADS(1)
-        pxor mm7, mm7
-
-        mov edx, ecx                    ; save ecx
-        and ecx, 0fffffffch             ; clear lower two bits
-        jnz .L1
-        jmp .L2
-
-.L1:
-
-        movq mm0, [esi]                 ; A R G B a r g b
-        pand mm0, mm6                   ; 0 R G B 0 r g b
-        movq mm1, [esi+8]               ; A R G B a r g b
-        pand mm1, mm6                   ; 0 R G B 0 r g b
-
-        movq mm2, mm0                   ; 0 R G B 0 r g b
-        punpckhdq mm2, mm7              ; 0 0 0 0 0 R G B
-        punpckldq mm0, mm7              ; 0 0 0 0 0 r g b
-        psllq mm2, 24                   ; 0 0 R G B 0 0 0
-        por mm0, mm2                    ; 0 0 R G B r g b
-
-        movq mm3, mm1                   ; 0 R G B 0 r g b
-        psllq mm3, 48                   ; g b 0 0 0 0 0 0
-        por mm0, mm3                    ; g b R G B r g b
-
-        movq mm4, mm1                   ; 0 R G B 0 r g b
-        punpckhdq mm4, mm7              ; 0 0 0 0 0 R G B
-        punpckldq mm1, mm7              ; 0 0 0 0 0 r g b
-        psrlq mm1, 16                   ; 0 0 0 R G B 0 r
-        psllq mm4, 8                    ; 0 0 0 0 R G B 0
-        por mm1, mm4                    ; 0 0 0 0 R G B r
-
-        movq [edi], mm0
-        add esi, BYTE 16
-        movd [edi+8], mm1
-        add edi, BYTE 12
-        sub ecx, BYTE 4
-        jnz .L1
-
-.L2:
-        mov ecx, edx
-        and ecx, BYTE 3
-        jz .L4
-.L3:
-        mov al, [esi]
-        mov bl, [esi+1]
-        mov dl, [esi+2]
-        mov [edi], al
-        mov [edi+1], bl
-        mov [edi+2], dl
-        add esi, BYTE 4
-        add edi, BYTE 3
-        dec ecx
-        jnz .L3
-.L4:
-        return
-
-
-
-_ConvertMMXpII32_16RGB565:
-
-        ; set up masks
-        load_immq mm5, mmx32_rgb565_b
-        load_immq mm6, mmx32_rgb565_g
-        load_immq mm7, mmx32_rgb565_r
-        CLEANUP_IMMQ_LOADS(3)
-
-        mov edx, ecx
-        shr ecx, 2
-        jnz .L1
-        jmp .L2         ; not necessary at the moment, but doesn't hurt (much)
-
-.L1:
-        movq mm0, [esi]         ; argb
-        movq mm1, mm0           ; argb
-        pand mm0, mm6           ; 00g0
-        movq mm3, mm1           ; argb
-        pand mm1, mm5           ; 000b
-        pand mm3, mm7           ; 0r00
-        pslld mm1, 2            ; 0 0 000000bb bbb00000
-        por mm0, mm1            ; 0 0 ggggggbb bbb00000
-        psrld mm0, 5            ; 0 0 00000ggg gggbbbbb
-
-        movq mm4, [esi+8]       ; argb
-        movq mm2, mm4           ; argb
-        pand mm4, mm6           ; 00g0
-        movq mm1, mm2           ; argb
-        pand mm2, mm5           ; 000b
-        pand mm1, mm7           ; 0r00
-        pslld mm2, 2            ; 0 0 000000bb bbb00000
-        por mm4, mm2            ; 0 0 ggggggbb bbb00000
-        psrld mm4, 5            ; 0 0 00000ggg gggbbbbb
-
-        packuswb mm3, mm1       ; R 0 r 0
-        packssdw mm0, mm4       ; as above.. ish
-        por mm0, mm3            ; done.
-        movq [edi], mm0
-
-        add esi, 16
-        add edi, 8
-        dec ecx
-        jnz .L1
-
-.L2:
-        mov ecx, edx
-        and ecx, BYTE 3
-        jz .L4
-.L3:
-        mov al, [esi]
-        mov bh, [esi+1]
-        mov ah, [esi+2]
-        shr al, 3
-        and eax, 0F81Fh            ; BYTE?
-        shr ebx, 5
-        and ebx, 07E0h             ; BYTE?
-        add eax, ebx
-        mov [edi], al
-        mov [edi+1], ah
-        add esi, BYTE 4
-        add edi, BYTE 2
-        dec ecx
-        jnz .L3
-
-.L4:
-	retn
-
-	
-_ConvertMMXpII32_16BGR565:
-
-        load_immq mm5, mmx32_rgb565_r
-        load_immq mm6, mmx32_rgb565_g
-        load_immq mm7, mmx32_rgb565_b
-        CLEANUP_IMMQ_LOADS(3)
-
-        mov edx, ecx
-        shr ecx, 2
-        jnz .L1
-        jmp .L2
-
-.L1:
-        movq mm0, [esi]                 ; a r g b
-        movq mm1, mm0                   ; a r g b
-        pand mm0, mm6                   ; 0 0 g 0
-        movq mm3, mm1                   ; a r g b
-        pand mm1, mm5                   ; 0 r 0 0
-        pand mm3, mm7                   ; 0 0 0 b
-
-        psllq mm3, 16                   ; 0 b 0 0
-        psrld mm1, 14                   ; 0 0 000000rr rrr00000
-        por mm0, mm1                    ; 0 0 ggggggrr rrr00000
-        psrld mm0, 5                    ; 0 0 00000ggg gggrrrrr
-
-        movq mm4, [esi+8]               ; a r g b
-        movq mm2, mm4                   ; a r g b
-        pand mm4, mm6                   ; 0 0 g 0
-        movq mm1, mm2                   ; a r g b
-        pand mm2, mm5                   ; 0 r 0 0
-        pand mm1, mm7                   ; 0 0 0 b
-
-        psllq mm1, 16                   ; 0 b 0 0
-        psrld mm2, 14                   ; 0 0 000000rr rrr00000
-        por mm4, mm2                    ; 0 0 ggggggrr rrr00000
-        psrld mm4, 5                    ; 0 0 00000ggg gggrrrrr
-
-        packuswb mm3, mm1               ; BBBBB000 00000000 bbbbb000 00000000
-        packssdw mm0, mm4               ; 00000GGG GGGRRRRR 00000GGG GGGRRRRR
-        por mm0, mm3                    ; BBBBBGGG GGGRRRRR bbbbbggg gggrrrrr
-        movq [edi], mm0
-
-        add esi, BYTE 16
-        add edi, BYTE 8
-        dec ecx
-        jnz .L1
-
-.L2:
-        and edx, BYTE 3
-        jz .L4
-.L3:
-        mov al, [esi+2]
-        mov bh, [esi+1]
-        mov ah, [esi]
-        shr al, 3
-        and eax, 0F81Fh                    ; BYTE ?
-        shr ebx, 5
-        and ebx, 07E0h                     ; BYTE ?
-        add eax, ebx
-        mov [edi], al
-        mov [edi+1], ah
-        add esi, BYTE 4
-        add edi, BYTE 2
-        dec edx
-        jnz .L3
-
-.L4:
-        retn
-
-_ConvertMMXpII32_16BGR555:
-
-        ; the 16BGR555 converter is identical to the RGB555 one,
-        ; except it uses a different multiplier for the pmaddwd
-        ; instruction.  cool huh.
-
-        load_immq mm7, mmx32_bgr555_mul
-        jmp _convert_bgr555_cheat
-
-; This is the same as the Intel version.. they obviously went to
-; much more trouble to expand/coil the loop than I did, so theirs
-; would almost certainly be faster, even if only a little.
-; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is
-; (I think) a more accurate name..
-_ConvertMMXpII32_16RGB555:
-
-	load_immq mm7, mmx32_rgb555_mul
-_convert_bgr555_cheat:
-	load_immq mm6, mmx32_rgb555_g
-	CLEANUP_IMMQ_LOADS(2)
-        
-	mov edx,ecx		           ; Save ecx 
-
-        and ecx,DWORD 0fffffff8h            ; clear lower three bits
-	jnz .L_OK
-        jmp near .L2 
-
-.L_OK:
-	
-	movq mm2,[esi+8]
-
-	movq mm0,[esi]
-	movq mm3,mm2
-
-	pand_immq mm3, mmx32_rgb555_rb
-	movq mm1,mm0
-
-	pand_immq mm1, mmx32_rgb555_rb
-	pmaddwd mm3,mm7
-
-	CLEANUP_IMMQ_LOADS(2)
-
-	pmaddwd mm1,mm7
-	pand mm2,mm6
-
-.L1:
-	movq mm4,[esi+24]
-	pand mm0,mm6
-
-	movq mm5,[esi+16]
-	por mm3,mm2
-
-	psrld mm3,6
-	por mm1,mm0
-
-	movq mm0,mm4
-	psrld mm1,6
-
-	pand_immq mm0, mmx32_rgb555_rb
-	packssdw mm1,mm3
-
-	movq mm3,mm5
-	pmaddwd mm0,mm7
-
-	pand_immq mm3, mmx32_rgb555_rb
-	pand mm4,mm6
-
-	movq [edi],mm1			
-	pmaddwd mm3,mm7
-
-        add esi,BYTE 32
-	por mm4,mm0
-
-	pand mm5,mm6
-	psrld mm4,6
-
-	movq mm2,[esi+8]
-	por mm5,mm3
-
-	movq mm0,[esi]
-	psrld mm5,6
-
-	movq mm3,mm2
-	movq mm1,mm0
-
-	pand_immq mm3, mmx32_rgb555_rb
-	packssdw mm5,mm4
-
-	pand_immq mm1, mmx32_rgb555_rb
-	pand mm2,mm6
-
-	CLEANUP_IMMQ_LOADS(4)
-
-	movq [edi+8],mm5
-	pmaddwd mm3,mm7
-
-	pmaddwd mm1,mm7
-        add edi,BYTE 16
-	
-        sub ecx,BYTE 8
-	jz .L2
-        jmp .L1
-
-
-.L2:	
-	mov ecx,edx
-	
-        and ecx,BYTE 7
-	jz .L4
-	
-.L3:	
-	mov ebx,[esi]
-        add esi,BYTE 4
-	
-        mov eax,ebx
-        mov edx,ebx
-
-        shr eax,3
-        shr edx,6
-
-        and eax,BYTE 0000000000011111b
-        and edx,     0000001111100000b
-
-        shr ebx,9
-
-        or eax,edx
-
-        and ebx,     0111110000000000b
-
-        or eax,ebx
-
-        mov [edi],ax
-        add edi,BYTE 2
-
-	dec ecx
-	jnz .L3	
-
-.L4:		
-	retn
-
-%ifidn __OUTPUT_FORMAT__,elf
-section .note.GNU-stack noalloc noexec nowrite progbits
-%endif
--- a/src/hermes/x86_main.asm
+++ b/src/hermes/x86_main.asm
-;
-; x86 format converters for HERMES
-; Some routines Copyright (c) 1998 Christian Nentwich (brn@eleet.mcb.at)
-; This source code is licensed under the GNU LGPL
-; 
-; Please refer to the file COPYING.LIB contained in the distribution for
-; licensing conditions		
-;
-; Most routines are (c) Glenn Fiedler (ptc@gaffer.org), used with permission
-; 
-
-BITS 32
-
-%include "common.inc"
-
-SDL_FUNC _ConvertX86
-
-SECTION .text
-		
-;; _ConvertX86:	 
-;; [ESP+8] ConverterInfo*
-;; --------------------------------------------------------------------------
-;; ConverterInfo (ebp+..)
-;;   0:	void *s_pixels
-;;   4:	int s_width
-;;   8:	int s_height
-;;  12:	int s_add
-;;  16:	void *d_pixels
-;;  20:	int d_width
-;;  24:	int d_height
-;;  28:	int d_add
-;;  32:	void (*converter_function)() 
-;;  36: int32 *lookup
-	
-_ConvertX86:
-	push ebp
-	mov ebp,esp
-
-; Save the registers used by the blitters, necessary for optimized code
-	pusha
-
-	mov eax,[ebp+8]
-
-        cmp dword [eax+4],BYTE 0
-	je endconvert
-	
-	mov ebp,eax
-	
-	mov esi,[ebp+0]
-	mov edi,[ebp+16]
-	
-y_loop:	
-	mov ecx,[ebp+4]
-
-	call [ebp+32]
-
-	add esi,[ebp+12]
-	add edi,[ebp+28]
-	
-	dec dword  [ebp+8]
-	jnz y_loop
-
-; Restore the registers used by the blitters, necessary for optimized code
-	popa
-	
-	pop ebp
-
-endconvert:	
-	ret		
-
-
-
-%ifidn __OUTPUT_FORMAT__,elf
-section .note.GNU-stack noalloc noexec nowrite progbits
-%endif
--- a/src/hermes/x86p_16.asm
+++ b/src/hermes/x86p_16.asm
-;
-; x86 format converters for HERMES
-; Copyright (c) 1998 Glenn Fielder (gaffer@gaffer.org)
-; This source code is licensed under the GNU LGPL
-; 
-; Please refer to the file COPYING.LIB contained in the distribution for
-; licensing conditions		
-; 
-; Routines adjusted for Hermes by Christian Nentwich (brn@eleet.mcb.at)
-; Used with permission.
-; 
-
-BITS 32
-
-%include "common.inc"
-
-SDL_FUNC _ConvertX86p16_16BGR565
-SDL_FUNC _ConvertX86p16_16RGB555
-SDL_FUNC _ConvertX86p16_16BGR555
-SDL_FUNC _ConvertX86p16_8RGB332
-
-EXTERN _ConvertX86
-
-SECTION .text
-
-_ConvertX86p16_16BGR565:
-
-    ; check short
-    cmp ecx,BYTE 16
-    ja .L3
-
-
-.L1 ; short loop
-    mov al,[esi]
-    mov ah,[esi+1]
-    mov ebx,eax
-    mov edx,eax
-    shr eax,11
-    and eax,BYTE 11111b
-    and ebx,11111100000b
-    shl edx,11
-    add eax,ebx
-    add eax,edx
-    mov [edi],al
-    mov [edi+1],ah
-    add esi,BYTE 2
-    add edi,BYTE 2
-    dec ecx
-    jnz .L1
-.L2
-    retn
-
-.L3 ; head
-    mov eax,edi
-    and eax,BYTE 11b
-    jz .L4
-    mov al,[esi]
-    mov ah,[esi+1]
-    mov ebx,eax
-    mov edx,eax
-    shr eax,11
-    and eax,BYTE 11111b
-    and ebx,11111100000b
-    shl edx,11
-    add eax,ebx
-    add eax,edx
-    mov [edi],al
-    mov [edi+1],ah
-    add esi,BYTE 2
-    add edi,BYTE 2
-    dec ecx
-
-.L4 ; save count
-    push ecx
-
-    ; unroll twice
-    shr ecx,1
-    
-    ; point arrays to end
-    lea esi,[esi+ecx*4]
-    lea edi,[edi+ecx*4]
-
-    ; negative counter 
-    neg ecx
-    jmp SHORT .L6
-                              
-.L5     mov [edi+ecx*4-4],eax
-.L6     mov eax,[esi+ecx*4]
-
-        mov ebx,[esi+ecx*4]
-        and eax,07E007E0h         
-
-        mov edx,[esi+ecx*4]
-        and ebx,0F800F800h
-
-        shr ebx,11
-        and edx,001F001Fh
-
-        shl edx,11
-        add eax,ebx
-
-        add eax,edx                 
-        inc ecx
-
-        jnz .L5                 
-         
-    mov [edi+ecx*4-4],eax
-
-    ; tail
-    pop ecx
-    and ecx,BYTE 1
-    jz .L7
-    mov al,[esi]
-    mov ah,[esi+1]
-    mov ebx,eax
-    mov edx,eax
-    shr eax,11
-    and eax,BYTE 11111b
-    and ebx,11111100000b
-    shl edx,11
-    add eax,ebx
-    add eax,edx
-    mov [edi],al
-    mov [edi+1],ah
-    add esi,BYTE 2
-    add edi,BYTE 2
-
-.L7
-    retn
-
-
-
-
-
-
-_ConvertX86p16_16RGB555:
-
-    ; check short
-    cmp ecx,BYTE 32
-    ja .L3
-
-
-.L1 ; short loop
-    mov al,[esi]
-    mov ah,[esi+1]
-    mov ebx,eax
-    shr ebx,1
-    and ebx,     0111111111100000b
-    and eax,BYTE 0000000000011111b
-    add eax,ebx
-    mov [edi],al
-    mov [edi+1],ah
-    add esi,BYTE 2
-    add edi,BYTE 2
-    dec ecx
-    jnz .L1
-.L2
-    retn
-
-.L3 ; head
-    mov eax,edi
-    and eax,BYTE 11b
-    jz .L4
-    mov al,[esi]
-    mov ah,[esi+1]
-    mov ebx,eax
-    shr ebx,1
-    and ebx,     0111111111100000b
-    and eax,BYTE 0000000000011111b
-    add eax,ebx
-    mov [edi],al
-    mov [edi+1],ah
-    add esi,BYTE 2
-    add edi,BYTE 2
-    dec ecx
-
-.L4 ; save ebp
-    push ebp
-
-    ; save count
-    push ecx
-
-    ; unroll four times
-    shr ecx,2
-    
-    ; point arrays to end
-    lea esi,[esi+ecx*8]
-    lea edi,[edi+ecx*8]
-
-    ; negative counter 
-    xor ebp,ebp
-    sub ebp,ecx
-
-.L5     mov eax,[esi+ebp*8]        ; agi?
-        mov ecx,[esi+ebp*8+4]
-       
-        mov ebx,eax
-        mov edx,ecx
-
-        and eax,0FFC0FFC0h
-        and ecx,0FFC0FFC0h
-
-        shr eax,1
-        and ebx,001F001Fh
-
-        shr ecx,1
-        and edx,001F001Fh
-
-        add eax,ebx
-        add ecx,edx
-
-        mov [edi+ebp*8],eax
-        mov [edi+ebp*8+4],ecx
-
-        inc ebp
-        jnz .L5                 
-
-    ; tail
-    pop ecx
-.L6 and ecx,BYTE 11b
-    jz .L7
-    mov al,[esi]
-    mov ah,[esi+1]
-    mov ebx,eax
-    shr ebx,1
-    and ebx,     0111111111100000b
-    and eax,BYTE 0000000000011111b
-    add eax,ebx
-    mov [edi],al
-    mov [edi+1],ah
-    add esi,BYTE 2
-    add edi,BYTE 2
-    dec ecx
-    jmp SHORT .L6
-
-.L7 pop ebp
-    retn
-
-
-
-
-
-
-_ConvertX86p16_16BGR555:
-
-    ; check short
-    cmp ecx,BYTE 16
-    ja .L3
-
-	
-.L1 ; short loop
-    mov al,[esi]
-    mov ah,[esi+1]
-    mov ebx,eax
-    mov edx,eax
-    shr eax,11
-    and eax,BYTE 11111b
-    shr ebx,1
-    and ebx,1111100000b
-    shl edx,10
-    and edx,0111110000000000b
-    add eax,ebx
-    add eax,edx
-    mov [edi],al
-    mov [edi+1],ah
-    add esi,BYTE 2
-    add edi,BYTE 2
-    dec ecx
-    jnz .L1
-.L2
-    retn
-
-.L3 ; head
-    mov eax,edi
-    and eax,BYTE 11b
-    jz .L4
-    mov al,[esi]
-    mov ah,[esi+1]
-    mov ebx,eax
-    mov edx,eax
-    shr eax,11
-    and eax,BYTE 11111b
-    shr ebx,1
-    and ebx,1111100000b
-    shl edx,10
-    and edx,0111110000000000b
-    add eax,ebx
-    add eax,edx
-    mov [edi],al
-    mov [edi+1],ah
-    add esi,BYTE 2
-    add edi,BYTE 2
-    dec ecx
-
-.L4 ; save count
-    push ecx
-
-    ; unroll twice
-    shr ecx,1
-    
-    ; point arrays to end
-    lea esi,[esi+ecx*4]
-    lea edi,[edi+ecx*4]
-
-    ; negative counter 
-    neg ecx
-    jmp SHORT .L6
-                              
-.L5     mov [edi+ecx*4-4],eax
-.L6     mov eax,[esi+ecx*4]
-
-        shr eax,1
-        mov ebx,[esi+ecx*4]
-        
-        and eax,03E003E0h         
-        mov edx,[esi+ecx*4]
-
-        and ebx,0F800F800h
-
-        shr ebx,11
-        and edx,001F001Fh
-
-        shl edx,10
-        add eax,ebx
-
-        add eax,edx                 
-        inc ecx
-
-        jnz .L5                 
-         
-    mov [edi+ecx*4-4],eax
-
-    ; tail
-    pop ecx
-    and ecx,BYTE 1
-    jz .L7
-    mov al,[esi]
-    mov ah,[esi+1]
-    mov ebx,eax
-    mov edx,eax
-    shr eax,11
-    and eax,BYTE 11111b
-    shr ebx,1
-    and ebx,1111100000b
-    shl edx,10
-    and edx,0111110000000000b
-    add eax,ebx
-    add eax,edx
-    mov [edi],al
-    mov [edi+1],ah
-    add esi,BYTE 2
-    add edi,BYTE 2
-
-.L7
-    retn
-
-
-
-
-
-
-_ConvertX86p16_8RGB332:
-
-    ; check short
-    cmp ecx,BYTE 16
-    ja .L3
-
-
-.L1 ; short loop
-    mov al,[esi+0]
-    mov ah,[esi+1]
-    mov ebx,eax
-    mov edx,eax
-    and eax,BYTE 11000b         ; blue
-    shr eax,3
-    and ebx,11100000000b        ; green
-    shr ebx,6
-    and edx,1110000000000000b   ; red
-    shr edx,8
-    add eax,ebx
-    add eax,edx
-    mov [edi],al
-    add esi,BYTE 2
-    inc edi
-    dec ecx
-    jnz .L1
-.L2
-    retn
-
-.L3 mov eax,edi
-    and eax,BYTE 11b
-    jz .L4
-    mov al,[esi+0]
-    mov ah,[esi+1]
-    mov ebx,eax
-    mov edx,eax
-    and eax,BYTE 11000b         ; blue
-    shr eax,3
-    and ebx,11100000000b        ; green
-    shr ebx,6
-    and edx,1110000000000000b   ; red
-    shr edx,8
-    add eax,ebx
-    add eax,edx
-    mov [edi],al
-    add esi,BYTE 2
-    inc edi
-    dec ecx
-    jmp SHORT .L3
-
-.L4 ; save ebp
-    push ebp
-
-    ; save count
-    push ecx
-
-    ; unroll 4 times
-    shr ecx,2
-
-    ; prestep
-    mov dl,[esi+0]
-    mov bl,[esi+1]
-    mov dh,[esi+2]
-        
-.L5     shl edx,16
-        mov bh,[esi+3]
-        
-        shl ebx,16
-        mov dl,[esi+4]
-
-        mov dh,[esi+6]
-        mov bl,[esi+5]
-
-        and edx,00011000000110000001100000011000b
-        mov bh,[esi+7]
-
-        ror edx,16+3
-        mov eax,ebx                                     ; setup eax for reds
-
-        and ebx,00000111000001110000011100000111b
-        and eax,11100000111000001110000011100000b       ; reds
-
-        ror ebx,16-2
-        add esi,BYTE 8
-
-        ror eax,16
-        add edi,BYTE 4
-
-        add eax,ebx
-        mov bl,[esi+1]                                  ; greens
-
-        add eax,edx
-        mov dl,[esi+0]                                  ; blues
-
-        mov [edi-4],eax
-        mov dh,[esi+2]
-
-        dec ecx
-        jnz .L5                 
-    
-    ; check tail
-    pop ecx
-    and ecx,BYTE 11b
-    jz .L7
-
-.L6 ; tail
-    mov al,[esi+0]
-    mov ah,[esi+1]
-    mov ebx,eax
-    mov edx,eax
-    and eax,BYTE 11000b         ; blue
-    shr eax,3
-    and ebx,11100000000b        ; green
-    shr ebx,6
-    and edx,1110000000000000b   ; red
-    shr edx,8
-    add eax,ebx
-    add eax,edx
-    mov [edi],al
-    add esi,BYTE 2
-    inc edi
-    dec ecx
-    jnz .L6
-
-.L7 pop ebp
-    retn
-
-%ifidn __OUTPUT_FORMAT__,elf
-section .note.GNU-stack noalloc noexec nowrite progbits
-%endif
--- a/src/hermes/x86p_32.asm
+++ b/src/hermes/x86p_32.asm
--- a/src/video/SDL_blit.c
+++ b/src/video/SDL_blit.c
@@ -24,6 +24,7 @@
 #include "SDL_video.h"
 #include "SDL_sysvideo.h"
 #include "SDL_blit.h"
+#include "SDL_blit_copy.h"
 #include "SDL_RLEaccel_c.h"
 #include "SDL_pixels_c.h"

@@ -106,111 +107,64 @@ SDL_SoftBlit(SDL_Surface * src, SDL_Rect * srcrect,
    return (okay ? 0 : -1);
 }

-#ifdef MMX_ASMBLIT
-static __inline__ void
-SDL_memcpyMMX(Uint8 * to, const Uint8 * from, int len)
-{
-    int i;
+#ifdef __MACOSX__
+#include <sys/sysctl.h>

-    for (i = 0; i < len / 8; i++) {
-        __asm__ __volatile__("	movq (%0), %%mm0\n"
-                             "	movq %%mm0, (%1)\n"::"r"(from),
-                             "r"(to):"memory");
-        from += 8;
-        to += 8;
-    }
-    if (len & 7)
-        SDL_memcpy(to, from, len & 7);
-}
-
-static __inline__ void
-SDL_memcpySSE(Uint8 * to, const Uint8 * from, int len)
+static SDL_bool SDL_UseAltivecPrefetch()
 {
-    int i;
-
-    __asm__ __volatile__("	prefetchnta (%0)\n"
-                         "	prefetchnta 64(%0)\n"
-                         "	prefetchnta 128(%0)\n"
-                         "	prefetchnta 192(%0)\n"::"r"(from));
+    const char key[] = "hw.l3cachesize";
+    u_int64_t result = 0;
+    size_t typeSize = sizeof(result);

-    for (i = 0; i < len / 8; i++) {
-        __asm__ __volatile__("	prefetchnta 256(%0)\n"
-                             "	movq (%0), %%mm0\n"
-                             "	movntq %%mm0, (%1)\n"::"r"(from),
-                             "r"(to):"memory");
-        from += 8;
-        to += 8;
+    if (sysctlbyname(key, &result, &typeSize, NULL, 0) == 0 && result > 0) {
+        return SDL_TRUE;
+    } else {
+        return SDL_FALSE;
    }
-    if (len & 7)
-        SDL_memcpy(to, from, len & 7);
 }
-#endif
-
-static void
-SDL_BlitCopy(SDL_BlitInfo * info)
+#else
+static SDL_bool SDL_UseAltivecPrefetch()
 {
-    Uint8 *src, *dst;
-    int w, h;
-    int srcskip, dstskip;
-
-    w = info->d_width * info->dst->BytesPerPixel;
-    h = info->d_height;
-    src = info->s_pixels;
-    dst = info->d_pixels;
-    srcskip = w + info->s_skip;
-    dstskip = w + info->d_skip;
-#ifdef MMX_ASMBLIT
-    if (SDL_HasSSE()) {
-        while (h--) {
-            SDL_memcpySSE(dst, src, w);
-            src += srcskip;
-            dst += dstskip;
-        }
-        __asm__ __volatile__("	emms\n"::);
-    } else if (SDL_HasMMX()) {
-        while (h--) {
-            SDL_memcpyMMX(dst, src, w);
-            src += srcskip;
-            dst += dstskip;
-        }
-        __asm__ __volatile__("	emms\n"::);
-    } else
-#endif
-        while (h--) {
-            SDL_memcpy(dst, src, w);
-            src += srcskip;
-            dst += dstskip;
-        }
+    /* Just guess G4 */
+    return SDL_TRUE;
 }
+#endif /* __MACOSX__ */

-static void
-SDL_BlitCopyOverlap(SDL_BlitInfo * info)
+static SDL_loblit SDL_ChooseBlitFunc(SDL_BlitEntry *entries, int count)
 {
-    Uint8 *src, *dst;
-    int w, h;
-    int srcskip, dstskip;
+    int i;
+    static Uint32 features = 0xffffffff;
+
+    if (features == 0xffffffff) {
+        features = SDL_BLIT_ANY;

-    w = info->d_width * info->dst->BytesPerPixel;
-    h = info->d_height;
-    src = info->s_pixels;
-    dst = info->d_pixels;
-    srcskip = w + info->s_skip;
-    dstskip = w + info->d_skip;
-    if (dst < src) {
-        while (h--) {
-            SDL_memcpy(dst, src, w);
-            src += srcskip;
-            dst += dstskip;
+        /* Provide an override for testing .. */
+        const char *override = SDL_getenv("SDL_BLIT_FEATURES");
+        if (override) {
+            SDL_sscanf(override, "%u", &features);
+        } else {
+            if (SDL_HasMMX()) {
+                features |= SDL_BLIT_MMX;
+            }
+            if (SDL_HasSSE()) {
+                features |= SDL_BLIT_SSE;
+            }
+            if (SDL_HasAltivec()) {
+                if (SDL_UseAltivecPrefetch()) {
+                    features |= SDL_BLIT_ALTIVEC_PREFETCH;
+                } else {
+                    features |= SDL_BLIT_ALTIVEC_NOPREFETCH;
+                }
+            }
        }
-    } else {
-        src += ((h - 1) * srcskip);
-        dst += ((h - 1) * dstskip);
-        while (h--) {
-            SDL_revcpy(dst, src, w);
-            src -= srcskip;
-            dst -= dstskip;
+    }
+
+    for (i = count; i > 0; --i) {
+        if (features & entries[i].features) {
+            return entries[i].blit;
        }
    }
+    return entries[0].blit;
 }

 /* Figure out which of many blit routines to set up on a surface */
@@ -237,11 +191,11 @@ SDL_CalculateBlit(SDL_Surface * surface)

    /* Check for special "identity" case -- copy blit */
    if (surface->map->identity && blit_index == 0) {
-        surface->map->sw_data->blit = SDL_BlitCopy;
-
        /* Handle overlapping blits on the same surface */
        if (surface == surface->map->dst) {
            surface->map->sw_data->blit = SDL_BlitCopyOverlap;
+        } else {
+            surface->map->sw_data->blit = SDL_BlitCopy;
        }
    } else {
        if (surface->format->BitsPerPixel < 8) {

--- a/src/video/SDL_blit.h
+++ b/src/video/SDL_blit.h
@@ -67,6 +67,17 @@ typedef struct SDL_BlitMap
    unsigned int format_version;
 } SDL_BlitMap;

+#define SDL_BLIT_ANY                0x00000000
+#define SDL_BLIT_MMX                0x00000001
+#define SDL_BLIT_SSE                0x00000002
+#define SDL_BLIT_ALTIVEC_PREFETCH   0x00000004
+#define SDL_BLIT_ALTIVEC_NOPREFETCH 0x00000008
+
+typedef struct SDL_BlitEntry
+{
+    Uint32 features;
+    SDL_loblit blit;
+} SDL_BlitEntry;

 /* Functions found in SDL_blit.c */
 extern int SDL_CalculateBlit(SDL_Surface * surface);

--- a/src/video/SDL_blit_N.c
+++ b/src/video/SDL_blit_N.c
@@ -879,19 +879,6 @@ GetBlitFeatures(void)
 #define LO	1
 #endif

-#if SDL_HERMES_BLITTERS
-
-/* Heheheh, we coerce Hermes into using SDL blit information */
-#define X86_ASSEMBLER
-#define HermesConverterInterface	SDL_BlitInfo
-#define HermesClearInterface		void
-#define STACKCALL
-
-#include "../hermes/HeadMMX.h"
-#include "../hermes/HeadX86.h"
-
-#else
-
 /* Special optimized blit for RGB 8-8-8 --> RGB 3-3-2 */
 #define RGB888_RGB332(dst, src) { \
 	dst = (Uint8)((((src)&0x00E00000)>>16)| \
@@ -1250,8 +1237,6 @@ Blit_RGB888_RGB565(SDL_BlitInfo * info)
 #endif /* USE_DUFFS_LOOP */
 }

-#endif /* SDL_HERMES_BLITTERS */
-

 /* Special optimized blit for RGB 5-6-5 --> 32-bit RGB surfaces */
 #define RGB565_32(dst, src, map) (map[src[LO]*2] + map[src[HI]*2+1])
@@ -2357,17 +2342,7 @@ static const struct blit_table normal_blit_1[] = {
    {0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL},
 };
 static const struct blit_table normal_blit_2[] = {
-#if SDL_HERMES_BLITTERS
-    {0x0000F800, 0x000007E0, 0x0000001F, 2, 0x0000001F, 0x000007E0,
-     0x0000F800,
-     0, ConvertX86p16_16BGR565, ConvertX86, NO_ALPHA},
-    {0x0000F800, 0x000007E0, 0x0000001F, 2, 0x00007C00, 0x000003E0,
-     0x0000001F,
-     0, ConvertX86p16_16RGB555, ConvertX86, NO_ALPHA},
-    {0x0000F800, 0x000007E0, 0x0000001F, 2, 0x0000001F, 0x000003E0,
-     0x00007C00,
-     0, ConvertX86p16_16BGR555, ConvertX86, NO_ALPHA},
-#elif SDL_ALTIVEC_BLITTERS
+#if SDL_ALTIVEC_BLITTERS
    /* has-altivec */
    {0x0000F800, 0x000007E0, 0x0000001F, 4, 0x00000000, 0x00000000,
     0x00000000,
@@ -2397,47 +2372,6 @@ static const struct blit_table normal_blit_3[] = {
    {0, 0, 0, 0, 0, 0, 0, 0, NULL, BlitNtoN, 0}
 };
 static const struct blit_table normal_blit_4[] = {
-#if SDL_HERMES_BLITTERS
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x0000F800, 0x000007E0,
-     0x0000001F,
-     1, ConvertMMXpII32_16RGB565, ConvertMMX, NO_ALPHA},
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x0000F800, 0x000007E0,
-     0x0000001F,
-     0, ConvertX86p32_16RGB565, ConvertX86, NO_ALPHA},
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x0000001F, 0x000007E0,
-     0x0000F800,
-     1, ConvertMMXpII32_16BGR565, ConvertMMX, NO_ALPHA},
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x0000001F, 0x000007E0,
-     0x0000F800,
-     0, ConvertX86p32_16BGR565, ConvertX86, NO_ALPHA},
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x00007C00, 0x000003E0,
-     0x0000001F,
-     1, ConvertMMXpII32_16RGB555, ConvertMMX, NO_ALPHA},
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x00007C00, 0x000003E0,
-     0x0000001F,
-     0, ConvertX86p32_16RGB555, ConvertX86, NO_ALPHA},
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x0000001F, 0x000003E0,
-     0x00007C00,
-     1, ConvertMMXpII32_16BGR555, ConvertMMX, NO_ALPHA},
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x0000001F, 0x000003E0,
-     0x00007C00,
-     0, ConvertX86p32_16BGR555, ConvertX86, NO_ALPHA},
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 3, 0x00FF0000, 0x0000FF00,
-     0x000000FF,
-     0, ConvertX86p32_24RGB888, ConvertX86, NO_ALPHA},
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 3, 0x000000FF, 0x0000FF00,
-     0x00FF0000,
-     0, ConvertX86p32_24BGR888, ConvertX86, NO_ALPHA},
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 4, 0x000000FF, 0x0000FF00,
-     0x00FF0000,
-     0, ConvertX86p32_32BGR888, ConvertX86, NO_ALPHA},
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 4, 0xFF000000, 0x00FF0000,
-     0x0000FF00,
-     0, ConvertX86p32_32RGBA888, ConvertX86, NO_ALPHA},
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 4, 0x0000FF00, 0x00FF0000,
-     0xFF000000,
-     0, ConvertX86p32_32BGRA888, ConvertX86, NO_ALPHA},
-#else
 #if SDL_ALTIVEC_BLITTERS
    /* has-altivec | dont-use-prefetch */
    {0x00000000, 0x00000000, 0x00000000, 4, 0x00000000, 0x00000000,
@@ -2460,7 +2394,6 @@ static const struct blit_table normal_blit_4[] = {
    {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x00007C00, 0x000003E0,
     0x0000001F,
     0, NULL, Blit_RGB888_RGB555, NO_ALPHA},
-#endif
    /* Default for 32-bit RGB source, used if no other blitter matches */
    {0, 0, 0, 0, 0, 0, 0, 0, NULL, BlitNtoN, 0}
 };
@@ -2529,12 +2462,7 @@ SDL_CalculateBlitN(SDL_Surface * surface, int blit_index)
            if (surface->map->table) {
                blitfun = Blit_RGB888_index8_map;
            } else {
-#if SDL_HERMES_BLITTERS
-                sdata->aux_data = ConvertX86p32_8RGB332;
-                blitfun = ConvertX86;
-#else
                blitfun = Blit_RGB888_index8;
-#endif
            }
        } else {
            blitfun = BlitNto1;
@@ -2575,13 +2503,6 @@ SDL_CalculateBlitN(SDL_Surface * surface, int blit_index)
    }

 #ifdef DEBUG_ASM
-#if SDL_HERMES_BLITTERS
-    if (blitfun == ConvertMMX)
-        fprintf(stderr, "Using mmx blit\n");
-    else if (blitfun == ConvertX86)
-        fprintf(stderr, "Using asm blit\n");
-    else
-#endif
    if ((blitfun == BlitNtoN) || (blitfun == BlitNto1))
        fprintf(stderr, "Using C blit\n");
    else

--- a/src/video/SDL_blit_copy.c
+++ b/src/video/SDL_blit_copy.c
+/*
+    SDL - Simple DirectMedia Layer
+    Copyright (C) 1997-2006 Sam Lantinga
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    Sam Lantinga
+    slouken@libsdl.org
+*/
+#include "SDL_config.h"
+
+#include "SDL_video.h"
+#include "SDL_blit.h"
+
+/* The MMX/SSE intrinsics don't give access to specific registers for
+   the most memory parallelism, so we'll use GCC inline assembly here...
+*/
+#ifndef __GNUC__
+#undef __MMX__
+#undef __SSE__
+#endif
+
+#ifdef __MMX__
+/*
+ * Copy len bytes from src to dst.  The bulk is moved 64 bytes per
+ * iteration through all eight MMX registers so the loads can overlap;
+ * the remaining (len & 63) bytes are handed to SDL_memcpy().
+ *
+ * Only baseline MMX instructions (movq) are used here.  prefetchnta and
+ * movntq were introduced together with SSE (and AMD's extended MMX), so
+ * issuing them on a CPU that reports MMX but not SSE raises an invalid
+ * opcode fault.
+ *
+ * This routine leaves the MMX state dirty; the caller is expected to
+ * execute emms once the whole copy is finished.
+ */
+static __inline__ void
+SDL_memcpyMMX(Uint8 *dst, const Uint8 *src, int len)
+{
+    int i;
+
+    for (i = len / 64; i--;) {
+        __asm__ __volatile__ (
+        "movq (%0), %%mm0\n"
+        "movq 8(%0), %%mm1\n"
+        "movq 16(%0), %%mm2\n"
+        "movq 24(%0), %%mm3\n"
+        "movq 32(%0), %%mm4\n"
+        "movq 40(%0), %%mm5\n"
+        "movq 48(%0), %%mm6\n"
+        "movq 56(%0), %%mm7\n"
+        "movq %%mm0, (%1)\n"
+        "movq %%mm1, 8(%1)\n"
+        "movq %%mm2, 16(%1)\n"
+        "movq %%mm3, 24(%1)\n"
+        "movq %%mm4, 32(%1)\n"
+        "movq %%mm5, 40(%1)\n"
+        "movq %%mm6, 48(%1)\n"
+        "movq %%mm7, 56(%1)\n"
+        :: "r" (src), "r" (dst)
+        : "memory", "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");
+        src += 64;
+        dst += 64;
+    }
+    if (len & 63) {
+        /* Tail that does not fill a whole 64-byte chunk */
+        SDL_memcpy(dst, src, len & 63);
+    }
+}
+#endif /* __MMX__ */
+
+#ifdef __SSE__
+/*
+ * Copy len bytes from src to dst.  The bulk is moved 64 bytes per
+ * iteration through xmm0-xmm3 using non-temporal stores; the remaining
+ * (len & 63) bytes are handed to SDL_memcpy().
+ *
+ * movaps and movntps fault unless their memory operand is 16-byte
+ * aligned, so the caller must guarantee that both src and dst are
+ * 16-byte aligned.
+ */
+static __inline__ void
+SDL_memcpySSE(Uint8 *dst, const Uint8 *src, int len)
+{
+    int i;
+
+    for (i = len / 64; i--;) {
+        __asm__ __volatile__ (
+        "prefetchnta (%0)\n"
+        "movaps (%0), %%xmm0\n"
+        "movaps 16(%0), %%xmm1\n"
+        "movaps 32(%0), %%xmm2\n"
+        "movaps 48(%0), %%xmm3\n"
+        "movntps %%xmm0, (%1)\n"
+        "movntps %%xmm1, 16(%1)\n"
+        "movntps %%xmm2, 32(%1)\n"
+        "movntps %%xmm3, 48(%1)\n"
+        :: "r" (src), "r" (dst)
+        /* Tell the compiler which registers are overwritten so it never
+           keeps a live value in them across this statement. */
+        : "memory", "xmm0", "xmm1", "xmm2", "xmm3");
+        src += 64;
+        dst += 64;
+    }
+
+    /* Non-temporal stores are weakly ordered; fence them so the copied
+       data is globally visible before any store issued after we return. */
+    __asm__ __volatile__ ("sfence\n" ::: "memory");
+
+    if (len & 63) {
+        /* Tail that does not fill a whole 64-byte chunk */
+        SDL_memcpy(dst, src, len & 63);
+    }
+}
+#endif /* __SSE__ */
+
+/*
+ * Straight copy blit: copies info->d_height rows, each
+ * info->d_width * info->dst->BytesPerPixel bytes wide, from
+ * info->s_pixels to info->d_pixels.  Between rows the source advances
+ * by the row width plus info->s_skip and the destination by the row
+ * width plus info->d_skip.
+ *
+ * The widest copy routine that the CPU supports and that the buffer
+ * alignment allows is selected; SDL_memcpy() is the fallback.
+ */
+void
+SDL_BlitCopy(SDL_BlitInfo * info)
+{
+    Uint8 *src, *dst;
+    int w, h;
+    int srcskip, dstskip;
+
+    w = info->d_width * info->dst->BytesPerPixel;
+    h = info->d_height;
+    src = info->s_pixels;
+    dst = info->d_pixels;
+    /* From here on these are full row strides (pitches), not just padding */
+    srcskip = w + info->s_skip;
+    dstskip = w + info->d_skip;
+
+#ifdef __SSE__
+    /* movaps/movntps fault on operands that are not 16-byte aligned.
+       Row k starts at base + k * stride, so the strides have to be
+       multiples of 16 as well, not only the first row's address. */
+    if (SDL_HasSSE() &&
+        !((uintptr_t)src & 15) && !(srcskip & 15) &&
+        !((uintptr_t)dst & 15) && !(dstskip & 15)) {
+        while (h--) {
+            SDL_memcpySSE(dst, src, w);
+            src += srcskip;
+            dst += dstskip;
+        }
+        return;
+    }
+#endif
+
+#ifdef __MMX__
+    if (SDL_HasMMX() && !((uintptr_t)src & 7) && !((uintptr_t)dst & 7)) {
+        while (h--) {
+            SDL_memcpyMMX(dst, src, w);
+            src += srcskip;
+            dst += dstskip;
+        }
+        /* Leave MMX mode so later x87 floating point code works */
+        __asm__ __volatile__("	emms\n"::);
+        return;
+    }
+#endif
+
+    while (h--) {
+        SDL_memcpy(dst, src, w);
+        src += srcskip;
+        dst += dstskip;
+    }
+}
+
+/*
+ * Copy blit for the case where source and destination may overlap.
+ * A single stride (row width plus info->s_skip) is used for both sides.
+ * When the destination starts before the source, or at/after the end of
+ * the source span, the regular SDL_BlitCopy() is used; otherwise rows
+ * are copied last-to-first with SDL_revcpy().
+ */
+void
+SDL_BlitCopyOverlap(SDL_BlitInfo * info)
+{
+    int rows = info->d_height;
+    int rowbytes = info->d_width * info->dst->BytesPerPixel;
+    int stride = rowbytes + info->s_skip;
+    Uint8 *from = info->s_pixels;
+    Uint8 *to = info->d_pixels;
+
+    /* Forward copy is fine unless the destination begins inside the source */
+    if ((to < from) || (to >= (from + rows * stride))) {
+        SDL_BlitCopy(info);
+        return;
+    }
+
+    /* Start at the last row and walk upwards */
+    from += ((rows - 1) * stride);
+    to += ((rows - 1) * stride);
+    for (; rows; --rows) {
+        SDL_revcpy(to, from, rowbytes);
+        from -= stride;
+        to -= stride;
+    }
+}
+
+/* vi: set ts=4 sw=4 expandtab: */
--- a/src/video/SDL_blit_copy.h
+++ b/src/video/SDL_blit_copy.h
+/*
+    SDL - Simple DirectMedia Layer
+    Copyright (C) 1997-2006 Sam Lantinga
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    Sam Lantinga
+    slouken@libsdl.org
+*/
+
+/* Copy info->d_height rows of info->d_width * info->dst->BytesPerPixel
+   bytes from info->s_pixels to info->d_pixels, stepping each side by the
+   row width plus its s_skip / d_skip between rows. */
+void SDL_BlitCopy(SDL_BlitInfo * info);
+/* Same copy for buffers that may overlap: rows are copied last-to-first
+   with SDL_revcpy() when the destination starts inside the source span,
+   otherwise the work is passed to SDL_BlitCopy(). */
+void SDL_BlitCopyOverlap(SDL_BlitInfo * info);
+
+/* vi: set ts=4 sw=4 expandtab: */