Removed hermes since it's LGPL and not compatible with a commercial license.

Prepping for using MMX and SSE intrinsics instead of inline assembly ... except for memcpy equivalents, which only get faster if they can exploit the parallelism of loading into multiple SIMD registers. :) --HG-- extra : convert_revision : svn%3Ac70aab31-4412-0410-b14c-859654838e24/trunk%402609

Removed hermes since it's LGPL and not compatible with a commercial license.
Prepping for using MMX and SSE intrinsics instead of inline assembly ... except for memcpy equivalents, which only get faster if they can exploit the parallelism of loading into multiple SIMD registers. :) --HG-- extra : convert_revision : svn%3Ac70aab31-4412-0410-b14c-859654838e24/trunk%402609
37fe3a93 · Sam Lantinga · 92c5ea48 · 37fe3a93 · 37fe3a93 · 37fe3a93
Commit 37fe3a93 authored Aug 15, 2007 by Sam Lantinga
18 changed files
--- a/build-scripts/makedep.sh
+++ b/build-scripts/makedep.sh
@@ -65,12 +65,6 @@ __EOF__
 	\$(LIBTOOL) --mode=compile \$(CC) \$(CFLAGS) \$(EXTRA_CFLAGS) -c $src  -o \$@
-__EOF__
-        ;;
-        asm) cat >>${output}.new <<__EOF__
-	\$(LIBTOOL) --tag=CC --mode=compile \$(auxdir)/strip_fPIC.sh \$(NASM) $src -o \$@
 __EOF__
        ;;
        S) cat >>${output}.new <<__EOF__

--- a/configure.in
+++ b/configure.in
@@ -276,6 +276,127 @@ AC_HELP_STRING([--enable-assembly], [Enable assembly routines [[default=yes]]]),
              , enable_assembly=yes)
 if test x$enable_assembly = xyes; then
    AC_DEFINE(SDL_ASSEMBLY_ROUTINES)
+    dnl Check for various instruction support
+    dnl MMX: probe whether the compiler accepts -mmmx and can include mmintrin.h.
+    dnl On success the flag is appended to EXTRA_CFLAGS only.
+    AC_ARG_ENABLE(mmx,
+AC_HELP_STRING([--enable-mmx], [use MMX assembly routines [[default=yes]]]),
+                  , enable_mmx=yes)
+    if test x$enable_mmx = xyes; then
+        save_CFLAGS="$CFLAGS"
+        have_gcc_mmx=no
+        AC_MSG_CHECKING(for GCC -mmmx option)
+        mmx_CFLAGS="-mmmx"
+        CFLAGS="$save_CFLAGS $mmx_CFLAGS"
+        AC_TRY_COMPILE([
+        #include <mmintrin.h>
+        ],[
+        ],[
+        have_gcc_mmx=yes
+        ])
+        AC_MSG_RESULT($have_gcc_mmx)
+        dnl Restore CFLAGS so the probe flag does not leak into later checks.
+        CFLAGS="$save_CFLAGS"
+        if test x$have_gcc_mmx = xyes; then
+            EXTRA_CFLAGS="$EXTRA_CFLAGS $mmx_CFLAGS"
+        fi
+    fi
+    dnl SSE: probe whether the compiler accepts -msse and can include xmmintrin.h.
+    dnl On success the flag is appended to EXTRA_CFLAGS only.
+    AC_ARG_ENABLE(sse,
+AC_HELP_STRING([--enable-sse], [use SSE assembly routines [[default=yes]]]),
+                  , enable_sse=yes)
+    if test x$enable_sse = xyes; then
+        save_CFLAGS="$CFLAGS"
+        have_gcc_sse=no
+        AC_MSG_CHECKING(for GCC -msse option)
+        sse_CFLAGS="-msse"
+        CFLAGS="$save_CFLAGS $sse_CFLAGS"
+        AC_TRY_COMPILE([
+        #include <xmmintrin.h>
+        ],[
+        ],[
+        have_gcc_sse=yes
+        ])
+        AC_MSG_RESULT($have_gcc_sse)
+        dnl Restore CFLAGS so the probe flag does not leak into later checks.
+        CFLAGS="$save_CFLAGS"
+        if test x$have_gcc_sse = xyes; then
+            EXTRA_CFLAGS="$EXTRA_CFLAGS $sse_CFLAGS"
+        fi
+    fi
+    dnl Altivec: try compiling a vec_splat_u32 snippet with -maltivec first,
+    dnl then with -faltivec if that fails. The snippet includes altivec.h
+    dnl only when the header check below finds it.
+    AC_ARG_ENABLE(altivec,
+AC_HELP_STRING([--enable-altivec], [use Altivec assembly routines [[default=yes]]]),
+                  , enable_altivec=yes)
+    if test x$enable_altivec = xyes; then
+        have_altivec_h_hdr=no
+        AC_CHECK_HEADER(altivec.h, have_altivec_h_hdr=yes)
+        save_CFLAGS="$CFLAGS"
+        have_gcc_altivec=no
+        AC_MSG_CHECKING(for Altivec with GCC -maltivec option)
+        altivec_CFLAGS="-maltivec"
+        CFLAGS="$save_CFLAGS $altivec_CFLAGS"
+        if test x$have_altivec_h_hdr = xyes; then
+          AC_TRY_COMPILE([
+          #include <altivec.h>
+          vector unsigned int vzero() {
+              return vec_splat_u32(0);
+          }
+          ],[
+          ],[
+          have_gcc_altivec=yes
+          ])
+          AC_MSG_RESULT($have_gcc_altivec)
+        else
+          AC_TRY_COMPILE([
+          vector unsigned int vzero() {
+              return vec_splat_u32(0);
+          }
+          ],[
+          ],[
+          have_gcc_altivec=yes
+          ])
+          AC_MSG_RESULT($have_gcc_altivec)
+        fi
+        dnl Second attempt: same snippet, but with -faltivec instead of -maltivec.
+        if test x$have_gcc_altivec = xno; then
+            AC_MSG_CHECKING(for Altivec with GCC -faltivec option)
+            altivec_CFLAGS="-faltivec"
+            CFLAGS="$save_CFLAGS $altivec_CFLAGS"
+            if test x$have_altivec_h_hdr = xyes; then
+              AC_TRY_COMPILE([
+              #include <altivec.h>
+              vector unsigned int vzero() {
+                  return vec_splat_u32(0);
+              }
+              ],[
+              ],[
+              have_gcc_altivec=yes
+              ])
+              AC_MSG_RESULT($have_gcc_altivec)
+            else
+              AC_TRY_COMPILE([
+              vector unsigned int vzero() {
+                  return vec_splat_u32(0);
+              }
+              ],[
+              ],[
+              have_gcc_altivec=yes
+              ])
+              AC_MSG_RESULT($have_gcc_altivec)
+            fi
+        fi
+        dnl Put CFLAGS back to the value saved before the probes.
+        CFLAGS="$save_CFLAGS"
+        dnl On success, define the feature macros and append whichever flag
+        dnl was tried last to EXTRA_CFLAGS.
+        if test x$have_gcc_altivec = xyes; then
+            AC_DEFINE(SDL_ALTIVEC_BLITTERS)
+            if test x$have_altivec_h_hdr = xyes; then
+              AC_DEFINE(HAVE_ALTIVEC_H)
+            fi
+            EXTRA_CFLAGS="$EXTRA_CFLAGS $altivec_CFLAGS"
+        fi
+    fi
 fi
 dnl See if the OSS audio interface is supported
@@ -629,167 +750,6 @@ AC_HELP_STRING([--enable-mintaudio], [support Atari audio driver [[default=yes]]
    fi
 }
-dnl See if we can use x86 assembly blitters
-# NASM is available from: http://nasm.sourceforge.net
-CheckNASM()
-{
-    dnl Make sure we are running on an x86 platform
-    case $host in
-        i?86*)
-            ;;
-        *)
-        # Nope, bail early.
-            return
-            ;;
-    esac
-    dnl Check for NASM (for assembly blit routines)
-    AC_ARG_ENABLE(nasm,
-AC_HELP_STRING([--enable-nasm], [use nasm assembly blitters on x86 [[default=yes]]]),
-                  , enable_nasm=yes)
-    if test x$enable_video = xyes -a x$enable_assembly = xyes -a x$enable_nasm = xyes; then
-        CompileNASM()
-        {
-            # Usage: CompileNASM <filename>
-            AC_MSG_CHECKING(to see if $NASM supports $1)
-            if $NASM $NASMFLAGS $1 -o $1.o >&AS_MESSAGE_LOG_FD 2>&1; then
-                CompileNASM_ret="yes"
-            else
-                CompileNASM_ret="no"
-            fi
-            rm -f $1 $1.o
-            AC_MSG_RESULT($CompileNASM_ret)
-            test "$CompileNASM_ret" = "yes"
-        }
-        if test x"$NASMFLAGS" = x; then
-            case $ARCH in
-              win32)
-                  NASMFLAGS="-f win32"
-                  ;;
-              openbsd)
-                  NASMFLAGS="-f aoutb"
-                  ;;
-              macosx)
-                  NASMFLAGS="-f macho"
-                  ;;
-              *)
-                  NASMFLAGS="-f elf"
-                  ;;
-            esac
-        fi
-        AC_PATH_PROG(NASM, yasm)
-        echo "%ifidn __OUTPUT_FORMAT__,elf" > unquoted-sections
-        echo "section .note.GNU-stack noalloc noexec nowrite progbits" >> unquoted-sections
-        echo "%endif" >> unquoted-sections
-        CompileNASM unquoted-sections || NASM=""
-        if test "x$NASM" = x -o "x$NASM" = x'"$NASM"'; then
-            $as_unset ac_cv_path_NASM
-            AC_PATH_PROG(NASM, nasm)
-        fi
-        if test "x$NASM" != x -a "x$NASM" != x'"$NASM"'; then
-            AC_DEFINE(SDL_HERMES_BLITTERS)
-            SOURCES="$SOURCES $srcdir/src/hermes/*.asm"
-            NASMFLAGS="$NASMFLAGS -I $srcdir/src/hermes/"
-            dnl See if hidden visibility is supported
-            echo "GLOBAL _bar:function hidden" > symbol-visibility
-            echo "_bar:" >> symbol-visibility
-            CompileNASM symbol-visibility && NASMFLAGS="$NASMFLAGS -DHIDDEN_VISIBILITY"
-            AC_SUBST(NASM)
-            AC_SUBST(NASMFLAGS)
-            case "$host" in
-                # this line is needed for QNX, because it's not defined the __ELF__
-                *-*-qnx*)
-                     EXTRA_CFLAGS="$EXTRA_CFLAGS -D__ELF__";;
-                *-*-solaris*)
-                     EXTRA_CFLAGS="$EXTRA_CFLAGS -D__ELF__";;
-            esac
-        fi
-    fi
-}
-dnl Check for altivec instruction support using gas syntax
-CheckAltivec()
-{
-    AC_ARG_ENABLE(altivec,
-AC_HELP_STRING([--enable-altivec], [use altivec assembly blitters on PPC [[default=yes]]]),
-                  , enable_altivec=yes)
-    if test x$enable_video = xyes -a x$enable_assembly = xyes -a x$enable_altivec = xyes; then
-        have_altivec_h_hdr=no
-        AC_CHECK_HEADER(altivec.h, have_altivec_h_hdr=yes)
-        save_CFLAGS="$CFLAGS"
-        have_gcc_altivec=no
-        AC_MSG_CHECKING(for Altivec with GCC -maltivec option)
-        altivec_CFLAGS="-maltivec"
-        CFLAGS="$save_CFLAGS $altivec_CFLAGS"
-        if test x$have_altivec_h_hdr = xyes; then
-          AC_TRY_COMPILE([
-          #include <altivec.h>
-          vector unsigned int vzero() {
-              return vec_splat_u32(0);
-          }
-          ],[
-          ],[
-          have_gcc_altivec=yes
-          ])
-          AC_MSG_RESULT($have_gcc_altivec)
-        else
-          AC_TRY_COMPILE([
-          vector unsigned int vzero() {
-              return vec_splat_u32(0);
-          }
-          ],[
-          ],[
-          have_gcc_altivec=yes
-          ])
-          AC_MSG_RESULT($have_gcc_altivec)
-        fi
-        if test x$have_gcc_altivec = xno; then
-            AC_MSG_CHECKING(for Altivec with GCC -faltivec option)
-            altivec_CFLAGS="-faltivec"
-            CFLAGS="$save_CFLAGS $altivec_CFLAGS"
-            if test x$have_altivec_h_hdr = xyes; then
-              AC_TRY_COMPILE([
-              #include <altivec.h>
-              vector unsigned int vzero() {
-                  return vec_splat_u32(0);
-              }
-              ],[
-              ],[
-              have_gcc_altivec=yes
-              ])
-              AC_MSG_RESULT($have_gcc_altivec)
-            else
-              AC_TRY_COMPILE([
-              vector unsigned int vzero() {
-                  return vec_splat_u32(0);
-              }
-              ],[
-              ],[
-              have_gcc_altivec=yes
-              ])
-              AC_MSG_RESULT($have_gcc_altivec)
-            fi
-        fi
-        CFLAGS="$save_CFLAGS"
-        if test x$have_gcc_altivec = xyes; then
-            AC_DEFINE(SDL_ALTIVEC_BLITTERS)
-            if test x$have_altivec_h_hdr = xyes; then
-              AC_DEFINE(HAVE_ALTIVEC_H)
-            fi
-            EXTRA_CFLAGS="$EXTRA_CFLAGS $altivec_CFLAGS"
-        fi
-    fi
-}
 dnl See if GCC's -fvisibility=hidden is supported (gcc4 and later, usually).
 dnl  Details of this flag are here: http://gcc.gnu.org/wiki/Visibility
 CheckVisibilityHidden()
@@ -2043,8 +2003,6 @@ case "$host" in
        CheckDiskAudio
        CheckDummyAudio
        CheckDLOPEN
-        CheckNASM
-        CheckAltivec
        CheckOSS
        CheckDMEDIA
        CheckMME
@@ -2153,7 +2111,6 @@ case "$host" in
        CheckDummyVideo
        CheckDiskAudio
        CheckDummyAudio
-        # CheckNASM
        CheckDLOPEN
        CheckNAS
        CheckPHOTON
@@ -2197,7 +2154,6 @@ case "$host" in
        CheckWIN32
        CheckWIN32GL
        CheckDIRECTX
-        CheckNASM
        # Set up files for the video library
        if test x$enable_video = xyes; then
            AC_DEFINE(SDL_VIDEO_DRIVER_WIN32)
@@ -2278,7 +2234,6 @@ AC_HELP_STRING([--enable-render-d3d], [enable the Direct3D render driver [[defau
        CheckDummyVideo
        CheckDiskAudio
        CheckDummyAudio
-        CheckNASM
        CheckBWINDOW
        CheckBeGL
        # Set up files for the audio library
@@ -2344,7 +2299,6 @@ AC_HELP_STRING([--enable-render-d3d], [enable the Direct3D render driver [[defau
        CheckDiskAudio
        CheckDummyAudio
        CheckDLOPEN
-        CheckNASM
        # Set up files for the shared object loading library
        # (this needs to be done before the dynamic X11 check)
@@ -2359,7 +2313,6 @@ AC_HELP_STRING([--enable-render-d3d], [enable the Direct3D render driver [[defau
        CheckMacGL
        CheckOpenGLX11
        CheckPTHREAD
-        CheckAltivec
        # Good optimization on Mac OS X, yes...
        EXTRA_CFLAGS="$EXTRA_CFLAGS -falign-loops=16"

--- a/include/SDL_config.h.in
+++ b/include/SDL_config.h.in
@@ -292,7 +292,6 @@
 /* Enable assembly routines */
 #undef SDL_ASSEMBLY_ROUTINES
-#undef SDL_HERMES_BLITTERS
 #undef SDL_ALTIVEC_BLITTERS
 #endif /* _SDL_config_h */
--- a/src/hermes/COPYING.LIB
+++ b/src/hermes/COPYING.LIB
-		  GNU LIBRARY GENERAL PUBLIC LICENSE
-		       Version 2, June 1991
- Copyright (C) 1991 Free Software Foundation, Inc.
-    		    59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-[This is the first released version of the library GPL.  It is
- numbered 2 because it goes with version 2 of the ordinary GPL.]
-			    Preamble
-  The licenses for most software are designed to take away your
-freedom to share and change it.  By contrast, the GNU General Public
-Licenses are intended to guarantee your freedom to share and change
-free software--to make sure the software is free for all its users.
-  This license, the Library General Public License, applies to some
-specially designated Free Software Foundation software, and to any
-other libraries whose authors decide to use it.  You can use it for
-your libraries, too.
-  When we speak of free software, we are referring to freedom, not
-price.  Our General Public Licenses are designed to make sure that you
-have the freedom to distribute copies of free software (and charge for
-this service if you wish), that you receive source code or can get it
-if you want it, that you can change the software or use pieces of it
-in new free programs; and that you know you can do these things.
-  To protect your rights, we need to make restrictions that forbid
-anyone to deny you these rights or to ask you to surrender the rights.
-These restrictions translate to certain responsibilities for you if
-you distribute copies of the library, or if you modify it.
-  For example, if you distribute copies of the library, whether gratis
-or for a fee, you must give the recipients all the rights that we gave
-you.  You must make sure that they, too, receive or can get the source
-code.  If you link a program with the library, you must provide
-complete object files to the recipients so that they can relink them
-with the library, after making changes to the library and recompiling
-it.  And you must show them these terms so they know their rights.
-  Our method of protecting your rights has two steps: (1) copyright
-the library, and (2) offer you this license which gives you legal
-permission to copy, distribute and/or modify the library.
-  Also, for each distributor's protection, we want to make certain
-that everyone understands that there is no warranty for this free
-library.  If the library is modified by someone else and passed on, we
-want its recipients to know that what they have is not the original
-version, so that any problems introduced by others will not reflect on
-the original authors' reputations.
-  Finally, any free program is threatened constantly by software
-patents.  We wish to avoid the danger that companies distributing free
-software will individually obtain patent licenses, thus in effect
-transforming the program into proprietary software.  To prevent this,
-we have made it clear that any patent must be licensed for everyone's
-free use or not licensed at all.
-  Most GNU software, including some libraries, is covered by the ordinary
-GNU General Public License, which was designed for utility programs.  This
-license, the GNU Library General Public License, applies to certain
-designated libraries.  This license is quite different from the ordinary
-one; be sure to read it in full, and don't assume that anything in it is
-the same as in the ordinary license.
-  The reason we have a separate public license for some libraries is that
-they blur the distinction we usually make between modifying or adding to a
-program and simply using it.  Linking a program with a library, without
-changing the library, is in some sense simply using the library, and is
-analogous to running a utility program or application program.  However, in
-a textual and legal sense, the linked executable is a combined work, a
-derivative of the original library, and the ordinary General Public License
-treats it as such.
-  Because of this blurred distinction, using the ordinary General
-Public License for libraries did not effectively promote software
-sharing, because most developers did not use the libraries.  We
-concluded that weaker conditions might promote sharing better.
-  However, unrestricted linking of non-free programs would deprive the
-users of those programs of all benefit from the free status of the
-libraries themselves.  This Library General Public License is intended to
-permit developers of non-free programs to use free libraries, while
-preserving your freedom as a user of such programs to change the free
-libraries that are incorporated in them.  (We have not seen how to achieve
-this as regards changes in header files, but we have achieved it as regards
-changes in the actual functions of the Library.)  The hope is that this
-will lead to faster development of free libraries.
-  The precise terms and conditions for copying, distribution and
-modification follow.  Pay close attention to the difference between a
-"work based on the library" and a "work that uses the library".  The
-former contains code derived from the library, while the latter only
-works together with the library.
-  Note that it is possible for a library to be covered by the ordinary
-General Public License rather than by this special one.
-		  GNU LIBRARY GENERAL PUBLIC LICENSE
-   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
-  0. This License Agreement applies to any software library which
-contains a notice placed by the copyright holder or other authorized
-party saying it may be distributed under the terms of this Library
-General Public License (also called "this License").  Each licensee is
-addressed as "you".
-  A "library" means a collection of software functions and/or data
-prepared so as to be conveniently linked with application programs
-(which use some of those functions and data) to form executables.
-  The "Library", below, refers to any such software library or work
-which has been distributed under these terms.  A "work based on the
-Library" means either the Library or any derivative work under
-copyright law: that is to say, a work containing the Library or a
-portion of it, either verbatim or with modifications and/or translated
-straightforwardly into another language.  (Hereinafter, translation is
-included without limitation in the term "modification".)
-  "Source code" for a work means the preferred form of the work for
-making modifications to it.  For a library, complete source code means
-all the source code for all modules it contains, plus any associated
-interface definition files, plus the scripts used to control compilation
-and installation of the library.
-  Activities other than copying, distribution and modification are not
-covered by this License; they are outside its scope.  The act of
-running a program using the Library is not restricted, and output from
-such a program is covered only if its contents constitute a work based
-on the Library (independent of the use of the Library in a tool for
-writing it).  Whether that is true depends on what the Library does
-and what the program that uses the Library does.
-  1. You may copy and distribute verbatim copies of the Library's
-complete source code as you receive it, in any medium, provided that
-you conspicuously and appropriately publish on each copy an
-appropriate copyright notice and disclaimer of warranty; keep intact
-all the notices that refer to this License and to the absence of any
-warranty; and distribute a copy of this License along with the
-Library.
-  You may charge a fee for the physical act of transferring a copy,
-and you may at your option offer warranty protection in exchange for a
-fee.
-  2. You may modify your copy or copies of the Library or any portion
-of it, thus forming a work based on the Library, and copy and
-distribute such modifications or work under the terms of Section 1
-above, provided that you also meet all of these conditions:
-    a) The modified work must itself be a software library.
-    b) You must cause the files modified to carry prominent notices
-    stating that you changed the files and the date of any change.
-    c) You must cause the whole of the work to be licensed at no
-    charge to all third parties under the terms of this License.
-    d) If a facility in the modified Library refers to a function or a
-    table of data to be supplied by an application program that uses
-    the facility, other than as an argument passed when the facility
-    is invoked, then you must make a good faith effort to ensure that,
-    in the event an application does not supply such function or
-    table, the facility still operates, and performs whatever part of
-    its purpose remains meaningful.
-    (For example, a function in a library to compute square roots has
-    a purpose that is entirely well-defined independent of the
-    application.  Therefore, Subsection 2d requires that any
-    application-supplied function or table used by this function must
-    be optional: if the application does not supply it, the square
-    root function must still compute square roots.)
-These requirements apply to the modified work as a whole.  If
-identifiable sections of that work are not derived from the Library,
-and can be reasonably considered independent and separate works in
-themselves, then this License, and its terms, do not apply to those
-sections when you distribute them as separate works.  But when you
-distribute the same sections as part of a whole which is a work based
-on the Library, the distribution of the whole must be on the terms of
-this License, whose permissions for other licensees extend to the
-entire whole, and thus to each and every part regardless of who wrote
-it.
-Thus, it is not the intent of this section to claim rights or contest
-your rights to work written entirely by you; rather, the intent is to
-exercise the right to control the distribution of derivative or
-collective works based on the Library.
-In addition, mere aggregation of another work not based on the Library
-with the Library (or with a work based on the Library) on a volume of
-a storage or distribution medium does not bring the other work under
-the scope of this License.
-  3. You may opt to apply the terms of the ordinary GNU General Public
-License instead of this License to a given copy of the Library.  To do
-this, you must alter all the notices that refer to this License, so
-that they refer to the ordinary GNU General Public License, version 2,
-instead of to this License.  (If a newer version than version 2 of the
-ordinary GNU General Public License has appeared, then you can specify
-that version instead if you wish.)  Do not make any other change in
-these notices.
-  Once this change is made in a given copy, it is irreversible for
-that copy, so the ordinary GNU General Public License applies to all
-subsequent copies and derivative works made from that copy.
-  This option is useful when you wish to copy part of the code of
-the Library into a program that is not a library.
-  4. You may copy and distribute the Library (or a portion or
-derivative of it, under Section 2) in object code or executable form
-under the terms of Sections 1 and 2 above provided that you accompany
-it with the complete corresponding machine-readable source code, which
-must be distributed under the terms of Sections 1 and 2 above on a
-medium customarily used for software interchange.
-  If distribution of object code is made by offering access to copy
-from a designated place, then offering equivalent access to copy the
-source code from the same place satisfies the requirement to
-distribute the source code, even though third parties are not
-compelled to copy the source along with the object code.
-  5. A program that contains no derivative of any portion of the
-Library, but is designed to work with the Library by being compiled or
-linked with it, is called a "work that uses the Library".  Such a
-work, in isolation, is not a derivative work of the Library, and
-therefore falls outside the scope of this License.
-  However, linking a "work that uses the Library" with the Library
-creates an executable that is a derivative of the Library (because it
-contains portions of the Library), rather than a "work that uses the
-library".  The executable is therefore covered by this License.
-Section 6 states terms for distribution of such executables.
-  When a "work that uses the Library" uses material from a header file
-that is part of the Library, the object code for the work may be a
-derivative work of the Library even though the source code is not.
-Whether this is true is especially significant if the work can be
-linked without the Library, or if the work is itself a library.  The
-threshold for this to be true is not precisely defined by law.
-  If such an object file uses only numerical parameters, data
-structure layouts and accessors, and small macros and small inline
-functions (ten lines or less in length), then the use of the object
-file is unrestricted, regardless of whether it is legally a derivative
-work.  (Executables containing this object code plus portions of the
-Library will still fall under Section 6.)
-  Otherwise, if the work is a derivative of the Library, you may
-distribute the object code for the work under the terms of Section 6.
-Any executables containing that work also fall under Section 6,
-whether or not they are linked directly with the Library itself.
-  6. As an exception to the Sections above, you may also compile or
-link a "work that uses the Library" with the Library to produce a
-work containing portions of the Library, and distribute that work
-under terms of your choice, provided that the terms permit
-modification of the work for the customer's own use and reverse
-engineering for debugging such modifications.
-  You must give prominent notice with each copy of the work that the
-Library is used in it and that the Library and its use are covered by
-this License.  You must supply a copy of this License.  If the work
-during execution displays copyright notices, you must include the
-copyright notice for the Library among them, as well as a reference
-directing the user to the copy of this License.  Also, you must do one
-of these things:
-    a) Accompany the work with the complete corresponding
-    machine-readable source code for the Library including whatever
-    changes were used in the work (which must be distributed under
-    Sections 1 and 2 above); and, if the work is an executable linked
-    with the Library, with the complete machine-readable "work that
-    uses the Library", as object code and/or source code, so that the
-    user can modify the Library and then relink to produce a modified
-    executable containing the modified Library.  (It is understood
-    that the user who changes the contents of definitions files in the
-    Library will not necessarily be able to recompile the application
-    to use the modified definitions.)
-    b) Accompany the work with a written offer, valid for at
-    least three years, to give the same user the materials
-    specified in Subsection 6a, above, for a charge no more
-    than the cost of performing this distribution.
-    c) If distribution of the work is made by offering access to copy
-    from a designated place, offer equivalent access to copy the above
-    specified materials from the same place.
-    d) Verify that the user has already received a copy of these
-    materials or that you have already sent this user a copy.
-  For an executable, the required form of the "work that uses the
-Library" must include any data and utility programs needed for
-reproducing the executable from it.  However, as a special exception,
-the source code distributed need not include anything that is normally
-distributed (in either source or binary form) with the major
-components (compiler, kernel, and so on) of the operating system on
-which the executable runs, unless that component itself accompanies
-the executable.
-  It may happen that this requirement contradicts the license
-restrictions of other proprietary libraries that do not normally
-accompany the operating system.  Such a contradiction means you cannot
-use both them and the Library together in an executable that you
-distribute.
-  7. You may place library facilities that are a work based on the
-Library side-by-side in a single library together with other library
-facilities not covered by this License, and distribute such a combined
-library, provided that the separate distribution of the work based on
-the Library and of the other library facilities is otherwise
-permitted, and provided that you do these two things:
-    a) Accompany the combined library with a copy of the same work
-    based on the Library, uncombined with any other library
-    facilities.  This must be distributed under the terms of the
-    Sections above.
-    b) Give prominent notice with the combined library of the fact
-    that part of it is a work based on the Library, and explaining
-    where to find the accompanying uncombined form of the same work.
-  8. You may not copy, modify, sublicense, link with, or distribute
-the Library except as expressly provided under this License.  Any
-attempt otherwise to copy, modify, sublicense, link with, or
-distribute the Library is void, and will automatically terminate your
-rights under this License.  However, parties who have received copies,
-or rights, from you under this License will not have their licenses
-terminated so long as such parties remain in full compliance.
-  9. You are not required to accept this License, since you have not
-signed it.  However, nothing else grants you permission to modify or
-distribute the Library or its derivative works.  These actions are
-prohibited by law if you do not accept this License.  Therefore, by
-modifying or distributing the Library (or any work based on the
-Library), you indicate your acceptance of this License to do so, and
-all its terms and conditions for copying, distributing or modifying
-the Library or works based on it.
-  10. Each time you redistribute the Library (or any work based on the
-Library), the recipient automatically receives a license from the
-original licensor to copy, distribute, link with or modify the Library
-subject to these terms and conditions.  You may not impose any further
-restrictions on the recipients' exercise of the rights granted herein.
-You are not responsible for enforcing compliance by third parties to
-this License.
-  11. If, as a consequence of a court judgment or allegation of patent
-infringement or for any other reason (not limited to patent issues),
-conditions are imposed on you (whether by court order, agreement or
-otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License.  If you cannot
-distribute so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you
-may not distribute the Library at all.  For example, if a patent
-license would not permit royalty-free redistribution of the Library by
-all those who receive copies directly or indirectly through you, then
-the only way you could satisfy both it and this License would be to
-refrain entirely from distribution of the Library.
-If any portion of this section is held invalid or unenforceable under any
-particular circumstance, the balance of the section is intended to apply,
-and the section as a whole is intended to apply in other circumstances.
-It is not the purpose of this section to induce you to infringe any
-patents or other property right claims or to contest validity of any
-such claims; this section has the sole purpose of protecting the
-integrity of the free software distribution system which is
-implemented by public license practices.  Many people have made
-generous contributions to the wide range of software distributed
-through that system in reliance on consistent application of that
-system; it is up to the author/donor to decide if he or she is willing
-to distribute software through any other system and a licensee cannot
-impose that choice.
-This section is intended to make thoroughly clear what is believed to
-be a consequence of the rest of this License.
-  12. If the distribution and/or use of the Library is restricted in
-certain countries either by patents or by copyrighted interfaces, the
-original copyright holder who places the Library under this License may add
-an explicit geographical distribution limitation excluding those countries,
-so that distribution is permitted only in or among countries not thus
-excluded.  In such case, this License incorporates the limitation as if
-written in the body of this License.
-  13. The Free Software Foundation may publish revised and/or new
-versions of the Library General Public License from time to time.
-Such new versions will be similar in spirit to the present version,
-but may differ in detail to address new problems or concerns.
-Each version is given a distinguishing version number.  If the Library
-specifies a version number of this License which applies to it and
-"any later version", you have the option of following the terms and
-conditions either of that version or of any later version published by
-the Free Software Foundation.  If the Library does not specify a
-license version number, you may choose any version ever published by
-the Free Software Foundation.
-  14. If you wish to incorporate parts of the Library into other free
-programs whose distribution conditions are incompatible with these,
-write to the author to ask for permission.  For software which is
-copyrighted by the Free Software Foundation, write to the Free
-Software Foundation; we sometimes make exceptions for this.  Our
-decision will be guided by the two goals of preserving the free status
-of all derivatives of our free software and of promoting the sharing
-and reuse of software generally.
-			    NO WARRANTY
-  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
-WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
-EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
-OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
-KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
-LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
-THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
-  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
-WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
-AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
-FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
-CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
-LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
-RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
-FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
-SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
-DAMAGES.
-		     END OF TERMS AND CONDITIONS
--- a/src/hermes/HeadMMX.h
+++ b/src/hermes/HeadMMX.h
-/*
-   Header definitions for the MMX routines for the HERMES library
-   Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
-   This source code is licensed under the GNU LGPL
-   Please refer to the file COPYING.LIB contained in the distribution for
-   licensing conditions
-*/
-#include "SDL_config.h"
-#ifndef __HERMES_HEAD_MMX__
-#define __HERMES_HEAD_MMX__
-/* If you cannot stand ifdefs, then please do not look into this file, it's
-   going to end your life :) */
-#ifdef X86_ASSEMBLER
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-    /* Entry points that receive a pointer to their interface structure. */
-    void STACKCALL ConvertMMX(HermesConverterInterface *);
-    void STACKCALL ClearMMX_32(HermesClearInterface *);
-    void STACKCALL ClearMMX_24(HermesClearInterface *);
-    void STACKCALL ClearMMX_16(HermesClearInterface *);
-    void STACKCALL ClearMMX_8(HermesClearInterface *);
-    /* Per-format converters, declared without a parameter list.
-       NOTE(review): presumably these are only installed as the converter
-       function pointer and never called directly from C - confirm.
-       Each routine is declared exactly once. */
-    void ConvertMMXpII32_24RGB888();
-    void ConvertMMXpII32_16RGB565();
-    void ConvertMMXpII32_16BGR565();
-    void ConvertMMXpII32_16RGB555();
-    void ConvertMMXpII32_16BGR555();
-    void ConvertMMXp32_16RGB555();
-#ifdef __cplusplus
-}
-#endif
-/* Fix the underscore business with ELF compilers: the C names are mapped
-   onto underscore-prefixed symbols with the macros below. */
-#if defined(__ELF__) && defined(__GNUC__)
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-    extern void _ConvertMMX(HermesConverterInterface *);
-    extern void _ConvertMMXpII32_24RGB888();
-    extern void _ConvertMMXpII32_16RGB565();
-    extern void _ConvertMMXpII32_16BGR565();
-    extern void _ConvertMMXpII32_16RGB555();
-    extern void _ConvertMMXpII32_16BGR555();
-#define ConvertMMX _ConvertMMX
-#define ConvertMMXpII32_24RGB888 _ConvertMMXpII32_24RGB888
-#define ConvertMMXpII32_16RGB565 _ConvertMMXpII32_16RGB565
-#define ConvertMMXpII32_16BGR565 _ConvertMMXpII32_16BGR565
-#define ConvertMMXpII32_16RGB555 _ConvertMMXpII32_16RGB555
-#define ConvertMMXpII32_16BGR555 _ConvertMMXpII32_16BGR555
-#ifdef __cplusplus
-}
-#endif
-#endif                          /* ELF and GNUC */
-/* Make it work with Watcom: give every routine an underscore-prefixed
-   symbol name; the pointer-taking entry points additionally list the
-   registers they modify. */
-#ifdef __WATCOMC__
-#pragma warning 601 9
-#pragma aux ConvertMMX "_*" modify [EAX EBX ECX EDX ESI EDI]
-#pragma aux ClearMMX_32 "_*" modify [EAX EBX ECX EDX ESI EDI]
-#pragma aux ClearMMX_24 "_*" modify [EAX EBX ECX EDX ESI EDI]
-#pragma aux ClearMMX_16 "_*" modify [EAX EBX ECX EDX ESI EDI]
-#pragma aux ClearMMX_8 "_*" modify [EAX EBX ECX EDX ESI EDI]
-#pragma aux ConvertMMXpII32_24RGB888 "_*"
-#pragma aux ConvertMMXpII32_16RGB565 "_*"
-#pragma aux ConvertMMXpII32_16BGR565 "_*"
-#pragma aux ConvertMMXpII32_16RGB555 "_*"
-#pragma aux ConvertMMXpII32_16BGR555 "_*"
-#pragma aux ConvertMMXp32_16RGB555 "_*"
-#endif                          /* WATCOM */
-#endif                          /* X86_ASSEMBLER */
-#endif
-/* vi: set ts=4 sw=4 expandtab: */
--- a/src/hermes/HeadX86.h
+++ b/src/hermes/HeadX86.h
-/*
-   Header definitions for the x86 routines for the HERMES library
-   Copyright (c) 1998 Christian Nentwich (brn@eleet.mcb.at)
-   This source code is licensed under the GNU LGPL
-   Please refer to the file COPYING.LIB contained in the distribution for
-   licensing conditions
-*/
-#ifndef __HERMES_HEAD_X86__
-#define __HERMES_HEAD_X86__
-#ifdef X86_ASSEMBLER
-/* If you can't stand IFDEFS, then close your eyes now, please :) */
-/* Ok, we start with normal function definitions */
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-    /* Entry points that receive a pointer to their interface structure,
-       plus the CPU query. */
-    void STACKCALL ConvertX86(HermesConverterInterface *);
-    void STACKCALL ClearX86_32(HermesClearInterface *);
-    void STACKCALL ClearX86_24(HermesClearInterface *);
-    void STACKCALL ClearX86_16(HermesClearInterface *);
-    void STACKCALL ClearX86_8(HermesClearInterface *);
-    int STACKCALL Hermes_X86_CPU();
-    /* Per-format converters, declared without a parameter list.
-       NOTE(review): presumably these are only installed as the converter
-       function pointer and never called directly from C - confirm. */
-    void ConvertX86p32_32BGR888();
-    void ConvertX86p32_32RGBA888();
-    void ConvertX86p32_32BGRA888();
-    void ConvertX86p32_24RGB888();
-    void ConvertX86p32_24BGR888();
-    void ConvertX86p32_16RGB565();
-    void ConvertX86p32_16BGR565();
-    void ConvertX86p32_16RGB555();
-    void ConvertX86p32_16BGR555();
-    void ConvertX86p32_8RGB332();
-    void ConvertX86p16_32RGB888();
-    void ConvertX86p16_32BGR888();
-    void ConvertX86p16_32RGBA888();
-    void ConvertX86p16_32BGRA888();
-    void ConvertX86p16_24RGB888();
-    void ConvertX86p16_24BGR888();
-    void ConvertX86p16_16BGR565();
-    void ConvertX86p16_16RGB555();
-    void ConvertX86p16_16BGR555();
-    void ConvertX86p16_8RGB332();
-    void CopyX86p_4byte();
-    void CopyX86p_3byte();
-    void CopyX86p_2byte();
-    void CopyX86p_1byte();
-    void ConvertX86pI8_32();
-    void ConvertX86pI8_24();
-    void ConvertX86pI8_16();
-    /* 512-entry tables defined outside this header. */
-    extern int ConvertX86p16_32RGB888_LUT_X86[512];
-    extern int ConvertX86p16_32BGR888_LUT_X86[512];
-    extern int ConvertX86p16_32RGBA888_LUT_X86[512];
-    extern int ConvertX86p16_32BGRA888_LUT_X86[512];
-#ifdef __cplusplus
-}
-#endif
-/* Now fix up the ELF underscore problem: the C names are mapped onto
-   underscore-prefixed symbols with the macros below.
-   NOTE(review): only a subset of the routines declared above is remapped
-   here - confirm that the remaining ones are not needed on ELF targets. */
-#if defined(__ELF__) && defined(__GNUC__)
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-    extern int _Hermes_X86_CPU();
-    extern void _ConvertX86(HermesConverterInterface *);
-    extern void _ConvertX86p32_32BGR888();
-    extern void _ConvertX86p32_32RGBA888();
-    extern void _ConvertX86p32_32BGRA888();
-    extern void _ConvertX86p32_24RGB888();
-    extern void _ConvertX86p32_24BGR888();
-    extern void _ConvertX86p32_16RGB565();
-    extern void _ConvertX86p32_16BGR565();
-    extern void _ConvertX86p32_16RGB555();
-    extern void _ConvertX86p32_16BGR555();
-    extern void _ConvertX86p32_8RGB332();
-    extern void _ConvertX86p16_16BGR565();
-    extern void _ConvertX86p16_16RGB555();
-    extern void _ConvertX86p16_16BGR555();
-    extern void _ConvertX86p16_8RGB332();
-#define Hermes_X86_CPU _Hermes_X86_CPU
-#define ConvertX86 _ConvertX86
-#define ConvertX86p32_32BGR888 _ConvertX86p32_32BGR888
-#define ConvertX86p32_32RGBA888 _ConvertX86p32_32RGBA888
-#define ConvertX86p32_32BGRA888 _ConvertX86p32_32BGRA888
-#define ConvertX86p32_24RGB888 _ConvertX86p32_24RGB888
-#define ConvertX86p32_24BGR888 _ConvertX86p32_24BGR888
-#define ConvertX86p32_16RGB565 _ConvertX86p32_16RGB565
-#define ConvertX86p32_16BGR565 _ConvertX86p32_16BGR565
-#define ConvertX86p32_16RGB555 _ConvertX86p32_16RGB555
-#define ConvertX86p32_16BGR555 _ConvertX86p32_16BGR555
-#define ConvertX86p32_8RGB332 _ConvertX86p32_8RGB332
-#define ConvertX86p16_16BGR565 _ConvertX86p16_16BGR565
-#define ConvertX86p16_16RGB555 _ConvertX86p16_16RGB555
-#define ConvertX86p16_16BGR555 _ConvertX86p16_16BGR555
-#define ConvertX86p16_8RGB332 _ConvertX86p16_8RGB332
-#ifdef __cplusplus
-}
-#endif
-#endif                          /* ELF & GNU */
-/* Make it run with WATCOM C: give every routine and table an
-   underscore-prefixed symbol name; the pointer-taking entry points
-   additionally list the registers they modify. */
-#ifdef __WATCOMC__
-#pragma warning 601 9
-#pragma aux Hermes_X86_CPU "_*"
-#pragma aux ConvertX86 "_*" modify [EAX EBX ECX EDX ESI EDI]
-#pragma aux ClearX86_32 "_*" modify [EAX EBX ECX EDX ESI EDI]
-#pragma aux ClearX86_24 "_*" modify [EAX EBX ECX EDX ESI EDI]
-#pragma aux ClearX86_16 "_*" modify [EAX EBX ECX EDX ESI EDI]
-#pragma aux ClearX86_8 "_*" modify [EAX EBX ECX EDX ESI EDI]
-#pragma aux ConvertX86p32_32BGR888 "_*"
-#pragma aux ConvertX86p32_32RGBA888 "_*"
-#pragma aux ConvertX86p32_32BGRA888 "_*"
-#pragma aux ConvertX86p32_24RGB888 "_*"
-#pragma aux ConvertX86p32_24BGR888 "_*"
-#pragma aux ConvertX86p32_16RGB565 "_*"
-#pragma aux ConvertX86p32_16BGR565 "_*"
-#pragma aux ConvertX86p32_16RGB555 "_*"
-#pragma aux ConvertX86p32_16BGR555 "_*"
-#pragma aux ConvertX86p32_8RGB332 "_*"
-#pragma aux ConvertX86p16_32RGB888 "_*"
-#pragma aux ConvertX86p16_32BGR888 "_*"
-#pragma aux ConvertX86p16_32RGBA888 "_*"
-#pragma aux ConvertX86p16_32BGRA888 "_*"
-#pragma aux ConvertX86p16_24RGB888 "_*"
-#pragma aux ConvertX86p16_24BGR888 "_*"
-#pragma aux ConvertX86p16_16BGR565 "_*"
-#pragma aux ConvertX86p16_16RGB555 "_*"
-#pragma aux ConvertX86p16_16BGR555 "_*"
-#pragma aux ConvertX86p16_8RGB332 "_*"
-#pragma aux CopyX86p_4byte "_*"
-#pragma aux CopyX86p_3byte "_*"
-#pragma aux CopyX86p_2byte "_*"
-#pragma aux CopyX86p_1byte "_*"
-#pragma aux ConvertX86pI8_32 "_*"
-#pragma aux ConvertX86pI8_24 "_*"
-#pragma aux ConvertX86pI8_16 "_*"
-#pragma aux ConvertX86p16_32RGB888_LUT_X86 "_*"
-#pragma aux ConvertX86p16_32BGR888_LUT_X86 "_*"
-#pragma aux ConvertX86p16_32RGBA888_LUT_X86 "_*"
-#pragma aux ConvertX86p16_32BGRA888_LUT_X86 "_*"
-#endif                          /* __WATCOMC__ */
-#endif                          /* X86_ASSEMBLER */
-#endif
-/* vi: set ts=4 sw=4 expandtab: */
--- a/src/hermes/README
+++ b/src/hermes/README
-HERMES 1.2.4 (c)1998 Christian Nentwich (brn) (c.nentwich@cs.ucl.ac.uk)
-and quite a few assembler routines (c) Glenn Fiedler (gaffer@gaffer.org)
-This library and all the files enclosed in this package are free software
-under the terms of the GNU Library General Public License (LGPL). Please
-refer to the included file COPYING.LIB for the exact terms.
----------------------------------------------------------------------------
-This is a stripped down version of HERMES, including only the x86 assembler
-converters, for use with Simple DirectMedia Layer.
-The full HERMES library is available at:  http://hermes.terminal.at/
--- a/src/hermes/common.inc
+++ b/src/hermes/common.inc
-; Shared helper macros for the hermes NASM sources
-;
-; SDL_FUNC <symbol>
-;   Exports <symbol> with GLOBAL.  When HIDDEN_VISIBILITY is defined the
-;   symbol is additionally declared as a function with hidden visibility.
-%macro SDL_FUNC 1
-%ifndef HIDDEN_VISIBILITY
-GLOBAL %1
-%else
-GLOBAL %1:function hidden
-%endif
-%endmacro
--- a/src/hermes/mmx_main.asm
+++ b/src/hermes/mmx_main.asm
-;
-; mmx format converter main loops for HERMES
-; Some routines Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
-; This source code is licensed under the GNU LGPL
-; 
-; Please refer to the file COPYING.LIB contained in the distribution for
-; licensing conditions		
-;
-BITS 32
-%include "common.inc"
-SDL_FUNC _ConvertMMX
-SECTION .text
-;; _ConvertMMX: run a per-scanline MMX converter over every row of a surface.
-;;
-;; Stack argument: ConverterInfo*, found at [ESP+4] on entry and at [EBP+8]
-;; once the prologue below has run.
-;; --------------------------------------------------------------------------
-;; ConverterInfo (ebp+..)
-;;   0:	void *s_pixels
-;;   4:	int s_width
-;;   8:	int s_height
-;;  12:	int s_add
-;;  16:	void *d_pixels
-;;  20:	int d_width
-;;  24:	int d_height
-;;  28:	int d_add
-;;  32:	void (*converter_function)()
-;;  36: int32 *lookup
-;;
-;; For each row the routine stored at offset 32 is called with ESI = source
-;; pointer, EDI = destination pointer and ECX = s_width; afterwards s_add and
-;; d_add are added to ESI and EDI.  s_height itself serves as the row
-;; counter, so the field is decremented in place in the caller's structure.
-;; All general purpose registers are preserved; EMMS is executed on exit.
-_ConvertMMX:
-	push ebp
-	mov ebp,esp
-; Save the registers used by the blitters, necessary for optimized code
-	pusha
-	mov eax,[ebp+8]
-; Nothing to convert when the surface has no columns or no rows.  The row
-; counter is decremented before it is tested, so a zero height would wrap
-; around instead of ending the loop.
-        cmp dword [eax+4],BYTE 0
-	je endconvert
-        cmp dword [eax+8],BYTE 0
-	je endconvert
-	mov ebp,eax
-	mov esi,[ebp+0]
-	mov edi,[ebp+16]
-y_loop:
-	mov ecx,[ebp+4]
-	call [ebp+32]
-	add esi,[ebp+12]
-	add edi,[ebp+28]
-	dec dword  [ebp+8]
-	jnz y_loop
-; Common exit.  The early-out branches arrive here with the PUSHA frame and
-; the saved EBP still on the stack, so the restore must follow the label;
-; otherwise RET would use a saved register as the return address.
-endconvert:
-	emms
-; Restore the registers used by the blitters, necessary for optimized code
-	popa
-	pop ebp
-	ret
-%ifidn __OUTPUT_FORMAT__,elf
-section .note.GNU-stack noalloc noexec nowrite progbits
-%endif
--- a/src/hermes/mmxp2_32.asm
+++ b/src/hermes/mmxp2_32.asm
-;
-; pII-optimised MMX format converters for HERMES
-; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
-;   and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
-; This source code is licensed under the GNU LGPL
-; 
-; Please refer to the file COPYING.LIB contained in the distribution for
-; licensing conditions		
-;
-; COPYRIGHT NOTICE
-; 
-; This file partly contains code that is (c) Intel Corporation, specifically
-; the mode detection routine, and the converter to 15 bit (8 pixel
-; conversion routine from the mmx programming tutorial pages).
-;
-;
-; These routines aren't exactly pII optimised - it's just that as they
-; are, they're terrible on p5 MMXs, but less so on pIIs.  Someone needs to
-; optimise them for p5 MMXs..
-BITS 32
-%include "common.inc"
-SDL_FUNC _ConvertMMXpII32_24RGB888
-SDL_FUNC _ConvertMMXpII32_16RGB565
-SDL_FUNC _ConvertMMXpII32_16BGR565
-SDL_FUNC _ConvertMMXpII32_16RGB555
-SDL_FUNC _ConvertMMXpII32_16BGR555
-;; Macros for conversion routines
-;;
-;; MMX instructions cannot take an immediate operand, so 64-bit constants are
-;; built on the stack: the same dword is pushed twice and the resulting
-;; quadword at [esp] is used as the memory operand.
-
-;; _push_immq_mask <dword>: push <dword> twice (8 bytes left on the stack).
-%macro _push_immq_mask 1
-	push dword %1
-	push dword %1
-%endmacro
-;; load_immq <mmreg>, <dword>: <mmreg> = <dword> replicated in both halves.
-;; Leaves 8 bytes on the stack.
-%macro load_immq 2
-	_push_immq_mask %2
-	movq %1, [esp]
-%endmacro
-;; pand_immq <mmreg>, <dword>: AND <mmreg> with <dword> replicated in both
-;; halves.  Leaves 8 bytes on the stack.
-%macro pand_immq 2
-	_push_immq_mask %2
-	pand %1, [esp]
-%endmacro
-;; CLEANUP_IMMQ_LOADS(num): release the stack space left behind by <num>
-;; uses of load_immq / pand_immq.  Note that ADD changes the flags.
-%define CLEANUP_IMMQ_LOADS(num) \
-	add esp, byte 8 * num
-;; Per-dword masks applied to 32-bit source pixels.
-%define mmx32_rgb888_mask 00ffffffh
-%define mmx32_rgb565_b 000000f8h
-%define mmx32_rgb565_g 0000fc00h
-%define mmx32_rgb565_r 00f80000h
-%define mmx32_rgb555_rb 00f800f8h
-%define mmx32_rgb555_g 0000f800h
-;; pmaddwd multipliers: the low word scales byte 0 of the pixel, the high
-;; word scales byte 2.  0008h is a shift left by 3, 2000h a shift left by 13;
-;; the two constants differ only in which byte receives which shift.
-%define mmx32_rgb555_mul 20000008h
-%define mmx32_bgr555_mul 00082000h
-SECTION .text
-; _ConvertMMXpII32_24RGB888
-; Copies the low three bytes of each 32-bit source pixel to a packed 24-bit
-; destination.
-;   ESI = source pointer (4 bytes per pixel)
-;   EDI = destination pointer (3 bytes per pixel)
-;   ECX = pixel count
-; Four pixels (16 bytes in, 12 bytes out) are handled per MMX pass; the
-; remaining ECX & 3 pixels are copied bytewise at .L3.
-; Uses EAX, EBX, EDX and mm0-mm4, mm6, mm7 as scratch.  EMMS is not executed
-; here.
-_ConvertMMXpII32_24RGB888:
-        ; set up mm6 as the mask, mm7 as zero
-        load_immq mm6, mmx32_rgb888_mask
-        CLEANUP_IMMQ_LOADS(1)
-        pxor mm7, mm7
-        mov edx, ecx                    ; save ecx
-        and ecx, 0fffffffch             ; clear lower two bits
-        jnz .L1
-        jmp .L2
-.L1:
-        movq mm0, [esi]                 ; A R G B a r g b
-        pand mm0, mm6                   ; 0 R G B 0 r g b
-        movq mm1, [esi+8]               ; A R G B a r g b
-        pand mm1, mm6                   ; 0 R G B 0 r g b
-        movq mm2, mm0                   ; 0 R G B 0 r g b
-        punpckhdq mm2, mm7              ; 0 0 0 0 0 R G B
-        punpckldq mm0, mm7              ; 0 0 0 0 0 r g b
-        psllq mm2, 24                   ; 0 0 R G B 0 0 0
-        por mm0, mm2                    ; 0 0 R G B r g b
-        movq mm3, mm1                   ; 0 R G B 0 r g b
-        psllq mm3, 48                   ; g b 0 0 0 0 0 0
-        por mm0, mm3                    ; g b R G B r g b
-        movq mm4, mm1                   ; 0 R G B 0 r g b
-        punpckhdq mm4, mm7              ; 0 0 0 0 0 R G B
-        punpckldq mm1, mm7              ; 0 0 0 0 0 r g b
-        psrlq mm1, 16                   ; 0 0 0 0 0 0 0 r
-        psllq mm4, 8                    ; 0 0 0 0 R G B 0
-        por mm1, mm4                    ; 0 0 0 0 R G B r
-        movq [edi], mm0
-        add esi, BYTE 16
-        movd [edi+8], mm1
-        add edi, BYTE 12
-        sub ecx, BYTE 4
-        jnz .L1
-.L2:
-        mov ecx, edx
-        and ecx, BYTE 3
-        jz .L4
-.L3:
-        ; EDX is free again here: the leftover count already lives in ECX
-        mov al, [esi]
-        mov bl, [esi+1]
-        mov dl, [esi+2]
-        mov [edi], al
-        mov [edi+1], bl
-        mov [edi+2], dl
-        add esi, BYTE 4
-        add edi, BYTE 3
-        dec ecx
-        jnz .L3
-.L4:
-        ; A real return instruction is required here: an unknown mnemonic is
-        ; assembled as a label and execution would run on into the next
-        ; routine.
-        retn
-; _ConvertMMXpII32_16RGB565
-; Converts 32-bit pixels at ESI into 16-bit pixels at EDI; ECX holds the
-; pixel count.  For each pixel the top five bits of byte 2, the top six bits
-; of byte 1 and the top five bits of byte 0 end up in bits 15-11, 10-5 and
-; 4-0 of the result.
-; Four pixels are converted per MMX pass; the remaining ECX & 3 pixels go
-; through the scalar loop at .L3.
-; Uses EAX, EBX, EDX and mm0-mm7 as scratch.  EMMS is not executed here.
-_ConvertMMXpII32_16RGB565:
-        ; set up masks
-        load_immq mm5, mmx32_rgb565_b
-        load_immq mm6, mmx32_rgb565_g
-        load_immq mm7, mmx32_rgb565_r
-        CLEANUP_IMMQ_LOADS(3)
-        mov edx, ecx
-        shr ecx, 2
-        jnz .L1
-        jmp .L2         ; not necessary at the moment, but doesn't hurt (much)
-.L1:
-        movq mm0, [esi]         ; argb
-        movq mm1, mm0           ; argb
-        pand mm0, mm6           ; 00g0
-        movq mm3, mm1           ; argb
-        pand mm1, mm5           ; 000b
-        pand mm3, mm7           ; 0r00
-        pslld mm1, 2            ; 0 0 000000bb bbb00000
-        por mm0, mm1            ; 0 0 ggggggbb bbb00000
-        psrld mm0, 5            ; 0 0 00000ggg gggbbbbb
-        movq mm4, [esi+8]       ; argb
-        movq mm2, mm4           ; argb
-        pand mm4, mm6           ; 00g0
-        movq mm1, mm2           ; argb
-        pand mm2, mm5           ; 000b
-        pand mm1, mm7           ; 0r00
-        pslld mm2, 2            ; 0 0 000000bb bbb00000
-        por mm4, mm2            ; 0 0 ggggggbb bbb00000
-        psrld mm4, 5            ; 0 0 00000ggg gggbbbbb
-        ; the masked red sits in the high word of each dword as 00f8h at
-        ; most; packing words to bytes moves it to the top of each word
-        packuswb mm3, mm1       ; R 0 r 0
-        packssdw mm0, mm4       ; as above.. ish
-        por mm0, mm3            ; done.
-        movq [edi], mm0
-        add esi, 16
-        add edi, 8
-        dec ecx
-        jnz .L1
-.L2:
-        mov ecx, edx
-        and ecx, BYTE 3
-        jz .L4
-.L3:
-        mov al, [esi]
-        mov bh, [esi+1]
-        mov ah, [esi+2]
-        shr al, 3
-        and eax, 0F81Fh            ; BYTE?
-        ; BL is not loaded; whatever it holds is shifted below bit 5 and
-        ; then masked away
-        shr ebx, 5
-        and ebx, 07E0h             ; BYTE?
-        add eax, ebx
-        mov [edi], al
-        mov [edi+1], ah
-        add esi, BYTE 4
-        add edi, BYTE 2
-        dec ecx
-        jnz .L3
-.L4:
-	retn
-; _ConvertMMXpII32_16BGR565
-; Converts 32-bit pixels at ESI into 16-bit pixels at EDI; ECX holds the
-; pixel count.  For each pixel the top five bits of byte 0, the top six bits
-; of byte 1 and the top five bits of byte 2 end up in bits 15-11, 10-5 and
-; 4-0 of the result.
-; Four pixels are converted per MMX pass; the remaining pixel count & 3
-; pixels go through the scalar loop at .L3, which counts in EDX.
-; Uses EAX, EBX, ECX, EDX and mm0-mm7 as scratch.  EMMS is not executed here.
-_ConvertMMXpII32_16BGR565:
-        ; mm5 = byte 2 mask, mm6 = byte 1 mask, mm7 = byte 0 mask
-        load_immq mm5, mmx32_rgb565_r
-        load_immq mm6, mmx32_rgb565_g
-        load_immq mm7, mmx32_rgb565_b
-        CLEANUP_IMMQ_LOADS(3)
-        mov edx, ecx
-        shr ecx, 2
-        jnz .L1
-        jmp .L2
-.L1:
-        movq mm0, [esi]                 ; a r g b
-        movq mm1, mm0                   ; a r g b
-        pand mm0, mm6                   ; 0 0 g 0
-        movq mm3, mm1                   ; a r g b
-        pand mm1, mm5                   ; 0 r 0 0
-        pand mm3, mm7                   ; 0 0 0 b
-        psllq mm3, 16                   ; 0 b 0 0
-        psrld mm1, 14                   ; 0 0 000000rr rrr00000
-        por mm0, mm1                    ; 0 0 ggggggrr rrr00000
-        psrld mm0, 5                    ; 0 0 00000ggg gggrrrrr
-        movq mm4, [esi+8]               ; a r g b
-        movq mm2, mm4                   ; a r g b
-        pand mm4, mm6                   ; 0 0 g 0
-        movq mm1, mm2                   ; a r g b
-        pand mm2, mm5                   ; 0 r 0 0
-        pand mm1, mm7                   ; 0 0 0 b
-        psllq mm1, 16                   ; 0 b 0 0
-        psrld mm2, 14                   ; 0 0 000000rr rrr00000
-        por mm4, mm2                    ; 0 0 ggggggrr rrr00000
-        psrld mm4, 5                    ; 0 0 00000ggg gggrrrrr
-        packuswb mm3, mm1               ; BBBBB000 00000000 bbbbb000 00000000
-        packssdw mm0, mm4               ; 00000GGG GGGRRRRR 00000GGG GGGRRRRR
-        por mm0, mm3                    ; BBBBBGGG GGGRRRRR bbbbbggg gggrrrrr
-        movq [edi], mm0
-        add esi, BYTE 16
-        add edi, BYTE 8
-        dec ecx
-        jnz .L1
-.L2:
-        ; EDX still holds the full pixel count; keep only the leftovers
-        and edx, BYTE 3
-        jz .L4
-.L3:
-        mov al, [esi+2]
-        mov bh, [esi+1]
-        mov ah, [esi]
-        shr al, 3
-        and eax, 0F81Fh                    ; BYTE ?
-        ; BL is not loaded; whatever it holds is shifted below bit 5 and
-        ; then masked away
-        shr ebx, 5
-        and ebx, 07E0h                     ; BYTE ?
-        add eax, ebx
-        mov [edi], al
-        mov [edi+1], ah
-        add esi, BYTE 4
-        add edi, BYTE 2
-        dec edx
-        jnz .L3
-.L4:
-        retn
-; _ConvertMMXpII32_16BGR555 / _ConvertMMXpII32_16RGB555
-; Two entry points sharing one body.  Each loads its own pmaddwd multiplier
-; into mm7 and then runs the common code at _convert_bgr555_cheat.
-;   ESI = source pointer (4 bytes per pixel)
-;   EDI = destination pointer (2 bytes per pixel)
-;   ECX = pixel count
-; Per pixel the top five bits of bytes 0, 1 and 2 are packed into a 15-bit
-; value; byte 1 always lands in bits 9-5, and the multiplier in mm7 decides
-; which of byte 0 / byte 2 lands in bits 4-0 and which in bits 14-10.
-; Eight pixels are converted per pass of the main loop; the remaining
-; ECX & 7 pixels are converted one at a time at .L3 with the same multiplier.
-; Uses EAX, EDX and mm0-mm7 as scratch.  EMMS is not executed here.
-_ConvertMMXpII32_16BGR555:
-        ; the 16BGR555 converter is identical to the RGB555 one,
-        ; except it uses a different multiplier for the pmaddwd
-        ; instruction.  cool huh.
-        load_immq mm7, mmx32_bgr555_mul
-        jmp _convert_bgr555_cheat
-; This is the same as the Intel version.. they obviously went to
-; much more trouble to expand/coil the loop than I did, so theirs
-; would almost certainly be faster, even if only a little.
-; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is
-; (I think) a more accurate name..
-_ConvertMMXpII32_16RGB555:
-	load_immq mm7, mmx32_rgb555_mul
-_convert_bgr555_cheat:
-	load_immq mm6, mmx32_rgb555_g
-	; both entry paths have pushed two quadwords by now (mm7 and mm6)
-	CLEANUP_IMMQ_LOADS(2)
-	mov edx,ecx		           ; Save ecx
-        and ecx,DWORD 0fffffff8h            ; clear lower three bits
-	jnz .L_OK
-        jmp near .L2
-.L_OK:
-	; prime the pipeline with the first four pixels
-	movq mm2,[esi+8]
-	movq mm0,[esi]
-	movq mm3,mm2
-	pand_immq mm3, mmx32_rgb555_rb
-	movq mm1,mm0
-	pand_immq mm1, mmx32_rgb555_rb
-	pmaddwd mm3,mm7
-	CLEANUP_IMMQ_LOADS(2)
-	pmaddwd mm1,mm7
-	pand mm2,mm6
-.L1:
-	movq mm4,[esi+24]
-	pand mm0,mm6
-	movq mm5,[esi+16]
-	por mm3,mm2
-	psrld mm3,6
-	por mm1,mm0
-	movq mm0,mm4
-	psrld mm1,6
-	pand_immq mm0, mmx32_rgb555_rb
-	packssdw mm1,mm3
-	movq mm3,mm5
-	pmaddwd mm0,mm7
-	pand_immq mm3, mmx32_rgb555_rb
-	pand mm4,mm6
-	movq [edi],mm1
-	pmaddwd mm3,mm7
-        add esi,BYTE 32
-	por mm4,mm0
-	pand mm5,mm6
-	psrld mm4,6
-	; NOTE(review): these two loads fetch the first four pixels of the next
-	; group before the loop counter is tested, so the final pass reads 16
-	; bytes beyond the last converted group.
-	movq mm2,[esi+8]
-	por mm5,mm3
-	movq mm0,[esi]
-	psrld mm5,6
-	movq mm3,mm2
-	movq mm1,mm0
-	pand_immq mm3, mmx32_rgb555_rb
-	packssdw mm5,mm4
-	pand_immq mm1, mmx32_rgb555_rb
-	pand mm2,mm6
-	; four pand_immq uses per pass
-	CLEANUP_IMMQ_LOADS(4)
-	movq [edi+8],mm5
-	pmaddwd mm3,mm7
-	pmaddwd mm1,mm7
-        add edi,BYTE 16
-        sub ecx,BYTE 8
-	jz .L2
-        jmp .L1
-.L2:
-	mov ecx,edx
-        and ecx,BYTE 7
-	jz .L4
-.L3:
-	; Convert the leftover pixels with the multiplier in mm7, so that the
-	; component order matches the entry point that was used.  A fixed
-	; shift-and-mask sequence here can only produce one of the two orders.
-	movd mm0,[esi]
-        add esi,BYTE 4
-	movq mm1,mm0
-	pand_immq mm1, mmx32_rgb555_rb
-	CLEANUP_IMMQ_LOADS(1)
-	pmaddwd mm1,mm7
-	pand mm0,mm6
-	por mm1,mm0
-	psrld mm1,6
-	movd eax,mm1
-        mov [edi],ax
-        add edi,BYTE 2
-	dec ecx
-	jnz .L3
-.L4:
-	retn
-%ifidn __OUTPUT_FORMAT__,elf
-section .note.GNU-stack noalloc noexec nowrite progbits
-%endif
--- a/src/hermes/x86_main.asm
+++ b/src/hermes/x86_main.asm
-;
-; x86 format converters for HERMES
-; Some routines Copyright (c) 1998 Christian Nentwich (brn@eleet.mcb.at)
-; This source code is licensed under the GNU LGPL
-; 
-; Please refer to the file COPYING.LIB contained in the distribution for
-; licensing conditions		
-;
-; Most routines are (c) Glenn Fiedler (ptc@gaffer.org), used with permission
-; 
-BITS 32
-%include "common.inc"
-SDL_FUNC _ConvertX86
-SECTION .text
-;; _ConvertX86: run a per-scanline converter over every row of a surface.
-;;
-;; Stack argument: ConverterInfo*, found at [ESP+4] on entry and at [EBP+8]
-;; once the prologue below has run.
-;; --------------------------------------------------------------------------
-;; ConverterInfo (ebp+..)
-;;   0:	void *s_pixels
-;;   4:	int s_width
-;;   8:	int s_height
-;;  12:	int s_add
-;;  16:	void *d_pixels
-;;  20:	int d_width
-;;  24:	int d_height
-;;  28:	int d_add
-;;  32:	void (*converter_function)()
-;;  36: int32 *lookup
-;;
-;; For each row the routine stored at offset 32 is called with ESI = source
-;; pointer, EDI = destination pointer and ECX = s_width; afterwards s_add and
-;; d_add are added to ESI and EDI.  s_height itself serves as the row
-;; counter, so the field is decremented in place in the caller's structure.
-;; All general purpose registers are preserved.
-_ConvertX86:
-	push ebp
-	mov ebp,esp
-; Save the registers used by the blitters, necessary for optimized code
-	pusha
-	mov eax,[ebp+8]
-; Nothing to convert when the surface has no columns or no rows.  The row
-; counter is decremented before it is tested, so a zero height would wrap
-; around instead of ending the loop.
-        cmp dword [eax+4],BYTE 0
-	je endconvert
-        cmp dword [eax+8],BYTE 0
-	je endconvert
-	mov ebp,eax
-	mov esi,[ebp+0]
-	mov edi,[ebp+16]
-y_loop:
-	mov ecx,[ebp+4]
-	call [ebp+32]
-	add esi,[ebp+12]
-	add edi,[ebp+28]
-	dec dword  [ebp+8]
-	jnz y_loop
-; Common exit.  The early-out branches arrive here with the PUSHA frame and
-; the saved EBP still on the stack, so the restore must follow the label;
-; otherwise RET would use a saved register as the return address.
-endconvert:
-; Restore the registers used by the blitters, necessary for optimized code
-	popa
-	pop ebp
-	ret
-%ifidn __OUTPUT_FORMAT__,elf
-section .note.GNU-stack noalloc noexec nowrite progbits
-%endif
--- a/src/hermes/x86p_16.asm
+++ b/src/hermes/x86p_16.asm
-;
-; x86 format converters for HERMES
-; Copyright (c) 1998 Glenn Fiedler (gaffer@gaffer.org)
-; This source code is licensed under the GNU LGPL
-; 
-; Please refer to the file COPYING.LIB contained in the distribution for
-; licensing conditions		
-; 
-; Routines adjusted for Hermes by Christian Nentwich (brn@eleet.mcb.at)
-; Used with permission.
-; 
-BITS 32
-%include "common.inc"
-SDL_FUNC _ConvertX86p16_16BGR565
-SDL_FUNC _ConvertX86p16_16RGB555
-SDL_FUNC _ConvertX86p16_16BGR555
-SDL_FUNC _ConvertX86p16_8RGB332
-EXTERN _ConvertX86
-SECTION .text
-; _ConvertX86p16_16BGR565
-; Converts ECX 16-bit pixels from ESI to EDI by exchanging the 5-bit fields
-; in bits 15-11 and bits 4-0 and keeping the 6-bit field in bits 10-5.
-; Rows of up to 16 pixels use the one-pixel loop at .L1.  Longer rows convert
-; one leading pixel when EDI is not a multiple of four, then two pixels per
-; pass, then one trailing pixel if the remaining count was odd.
-; The short loop tests its counter after decrementing it, so ECX must not be
-; zero on entry.  Uses EAX, EBX and EDX as scratch.
-_ConvertX86p16_16BGR565:
-    ; check short
-    cmp ecx,BYTE 16
-    ja .L3
-.L1 ; short loop
-    mov al,[esi]
-    mov ah,[esi+1]
-    mov ebx,eax
-    mov edx,eax
-    shr eax,11
-    and eax,BYTE 11111b
-    and ebx,11111100000b
-    ; bits shifted above bit 15 are harmless: only AL and AH are stored
-    shl edx,11
-    add eax,ebx
-    add eax,edx
-    mov [edi],al
-    mov [edi+1],ah
-    add esi,BYTE 2
-    add edi,BYTE 2
-    dec ecx
-    jnz .L1
-.L2
-    retn
-.L3 ; head
-    mov eax,edi
-    and eax,BYTE 11b
-    jz .L4
-    mov al,[esi]
-    mov ah,[esi+1]
-    mov ebx,eax
-    mov edx,eax
-    shr eax,11
-    and eax,BYTE 11111b
-    and ebx,11111100000b
-    shl edx,11
-    add eax,ebx
-    add eax,edx
-    mov [edi],al
-    mov [edi+1],ah
-    add esi,BYTE 2
-    add edi,BYTE 2
-    dec ecx
-.L4 ; save count
-    push ecx
-    ; unroll twice
-    shr ecx,1
-    ; point arrays to end
-    lea esi,[esi+ecx*4]
-    lea edi,[edi+ecx*4]
-    ; negative counter 
-    neg ecx
-    jmp SHORT .L6
-    ; each pass stores the result of the previous pass at .L5 and then
-    ; converts the next pair; the store after the loop writes the last pair
-.L5     mov [edi+ecx*4-4],eax
-.L6     mov eax,[esi+ecx*4]
-        mov ebx,[esi+ecx*4]
-        and eax,07E007E0h         
-        mov edx,[esi+ecx*4]
-        and ebx,0F800F800h
-        shr ebx,11
-        and edx,001F001Fh
-        shl edx,11
-        add eax,ebx
-        add eax,edx                 
-        inc ecx
-        jnz .L5                 
-    mov [edi+ecx*4-4],eax
-    ; tail
-    pop ecx
-    and ecx,BYTE 1
-    jz .L7
-    mov al,[esi]
-    mov ah,[esi+1]
-    mov ebx,eax
-    mov edx,eax
-    shr eax,11
-    and eax,BYTE 11111b
-    and ebx,11111100000b
-    shl edx,11
-    add eax,ebx
-    add eax,edx
-    mov [edi],al
-    mov [edi+1],ah
-    add esi,BYTE 2
-    add edi,BYTE 2
-.L7
-    retn
-; _ConvertX86p16_16RGB555
-; Converts ECX 16-bit pixels from ESI to EDI: bits 15-6 of each pixel move
-; down by one position (bit 5 is dropped) and bits 4-0 are kept, which turns
-; a 5-6-5 layout into 5-5-5 with bit 15 clear.
-; Rows of up to 32 pixels use the one-pixel loop at .L1.  Longer rows convert
-; one leading pixel when EDI is not a multiple of four, then four pixels per
-; pass, then the remaining zero to three pixels one at a time.
-; The short loop tests its counter after decrementing it, so ECX must not be
-; zero on entry.  Uses EAX, EBX and EDX as scratch; EBP is saved and
-; restored on the long path.
-_ConvertX86p16_16RGB555:
-    ; check short
-    cmp ecx,BYTE 32
-    ja .L3
-.L1 ; short loop
-    mov al,[esi]
-    mov ah,[esi+1]
-    mov ebx,eax
-    shr ebx,1
-    and ebx,     0111111111100000b
-    and eax,BYTE 0000000000011111b
-    add eax,ebx
-    mov [edi],al
-    mov [edi+1],ah
-    add esi,BYTE 2
-    add edi,BYTE 2
-    dec ecx
-    jnz .L1
-.L2
-    retn
-.L3 ; head
-    mov eax,edi
-    and eax,BYTE 11b
-    jz .L4
-    mov al,[esi]
-    mov ah,[esi+1]
-    mov ebx,eax
-    shr ebx,1
-    and ebx,     0111111111100000b
-    and eax,BYTE 0000000000011111b
-    add eax,ebx
-    mov [edi],al
-    mov [edi+1],ah
-    add esi,BYTE 2
-    add edi,BYTE 2
-    dec ecx
-.L4 ; save ebp
-    push ebp
-    ; save count
-    push ecx
-    ; unroll four times
-    shr ecx,2
-    ; point arrays to end
-    lea esi,[esi+ecx*8]
-    lea edi,[edi+ecx*8]
-    ; negative counter 
-    xor ebp,ebp
-    sub ebp,ecx
-    ; ECX is reused as a data register inside the loop, EBP counts up to zero
-.L5     mov eax,[esi+ebp*8]        ; agi?
-        mov ecx,[esi+ebp*8+4]
-        mov ebx,eax
-        mov edx,ecx
-        and eax,0FFC0FFC0h
-        and ecx,0FFC0FFC0h
-        shr eax,1
-        and ebx,001F001Fh
-        shr ecx,1
-        and edx,001F001Fh
-        add eax,ebx
-        add ecx,edx
-        mov [edi+ebp*8],eax
-        mov [edi+ebp*8+4],ecx
-        inc ebp
-        jnz .L5                 
-    ; tail
-    pop ecx
-    ; the first pass reduces the saved count to count & 3; the later passes
-    ; see the decremented value, which the mask leaves unchanged
-.L6 and ecx,BYTE 11b
-    jz .L7
-    mov al,[esi]
-    mov ah,[esi+1]
-    mov ebx,eax
-    shr ebx,1
-    and ebx,     0111111111100000b
-    and eax,BYTE 0000000000011111b
-    add eax,ebx
-    mov [edi],al
-    mov [edi+1],ah
-    add esi,BYTE 2
-    add edi,BYTE 2
-    dec ecx
-    jmp SHORT .L6
-.L7 pop ebp
-    retn
-; _ConvertX86p16_16BGR555
-; Converts ECX 16-bit pixels from ESI to EDI.  For each pixel bits 15-11 move
-; to bits 4-0, bits 10-6 move to bits 9-5 (bit 5 is dropped) and bits 4-0
-; move to bits 14-10; bit 15 of the result is clear.
-; Rows of up to 16 pixels use the one-pixel loop at .L1.  Longer rows convert
-; one leading pixel when EDI is not a multiple of four, then two pixels per
-; pass, then one trailing pixel if the remaining count was odd.
-; The short loop tests its counter after decrementing it, so ECX must not be
-; zero on entry.  Uses EAX, EBX and EDX as scratch.
-_ConvertX86p16_16BGR555:
-    ; check short
-    cmp ecx,BYTE 16
-    ja .L3
-.L1 ; short loop
-    mov al,[esi]
-    mov ah,[esi+1]
-    mov ebx,eax
-    mov edx,eax
-    shr eax,11
-    and eax,BYTE 11111b
-    shr ebx,1
-    and ebx,1111100000b
-    shl edx,10
-    and edx,0111110000000000b
-    add eax,ebx
-    add eax,edx
-    mov [edi],al
-    mov [edi+1],ah
-    add esi,BYTE 2
-    add edi,BYTE 2
-    dec ecx
-    jnz .L1
-.L2
-    retn
-.L3 ; head
-    mov eax,edi
-    and eax,BYTE 11b
-    jz .L4
-    mov al,[esi]
-    mov ah,[esi+1]
-    mov ebx,eax
-    mov edx,eax
-    shr eax,11
-    and eax,BYTE 11111b
-    shr ebx,1
-    and ebx,1111100000b
-    shl edx,10
-    and edx,0111110000000000b
-    add eax,ebx
-    add eax,edx
-    mov [edi],al
-    mov [edi+1],ah
-    add esi,BYTE 2
-    add edi,BYTE 2
-    dec ecx
-.L4 ; save count
-    push ecx
-    ; unroll twice
-    shr ecx,1
-    ; point arrays to end
-    lea esi,[esi+ecx*4]
-    lea edi,[edi+ecx*4]
-    ; negative counter 
-    neg ecx
-    jmp SHORT .L6
-    ; each pass stores the result of the previous pass at .L5 and then
-    ; converts the next pair; the store after the loop writes the last pair
-.L5     mov [edi+ecx*4-4],eax
-.L6     mov eax,[esi+ecx*4]
-        shr eax,1
-        mov ebx,[esi+ecx*4]
-        and eax,03E003E0h         
-        mov edx,[esi+ecx*4]
-        and ebx,0F800F800h
-        shr ebx,11
-        and edx,001F001Fh
-        shl edx,10
-        add eax,ebx
-        add eax,edx                 
-        inc ecx
-        jnz .L5                 
-    mov [edi+ecx*4-4],eax
-    ; tail
-    pop ecx
-    and ecx,BYTE 1
-    jz .L7
-    mov al,[esi]
-    mov ah,[esi+1]
-    mov ebx,eax
-    mov edx,eax
-    shr eax,11
-    and eax,BYTE 11111b
-    shr ebx,1
-    and ebx,1111100000b
-    shl edx,10
-    and edx,0111110000000000b
-    add eax,ebx
-    add eax,edx
-    mov [edi],al
-    mov [edi+1],ah
-    add esi,BYTE 2
-    add edi,BYTE 2
-.L7
-    retn
-_ConvertX86p16_8RGB332:
-    ; check short
-    cmp ecx,BYTE 16
-    ja .L3
-.L1 ; short loop
-    mov al,[esi+0]
-    mov ah,[esi+1]
-    mov ebx,eax
-    mov edx,eax
-    and eax,BYTE 11000b         ; blue
-    shr eax,3
-    and ebx,11100000000b        ; green
-    shr ebx,6
-    and edx,1110000000000000b   ; red
-    shr edx,8
-    add eax,ebx
-    add eax,edx
-    mov [edi],al
-    add esi,BYTE 2
-    inc edi
-    dec ecx
-    jnz .L1
-.L2
-    retn
-.L3 mov eax,edi
-    and eax,BYTE 11b
-    jz .L4
-    mov al,[esi+0]
-    mov ah,[esi+1]
-    mov ebx,eax
-    mov edx,eax
-    and eax,BYTE 11000b         ; blue
-    shr eax,3
-    and ebx,11100000000b        ; green
-    shr ebx,6
-    and edx,1110000000000000b   ; red
-    shr edx,8
-    add eax,ebx
-    add eax,edx
-    mov [edi],al
-    add esi,BYTE 2
-    inc edi
-    dec ecx
-    jmp SHORT .L3
-.L4 ; save ebp
-    push ebp
-    ; save count
-    push ecx
-    ; unroll 4 times
-    shr ecx,2
-    ; prestep
-    mov dl,[esi+0]
-    mov bl,[esi+1]
-    mov dh,[esi+2]
-.L5     shl edx,16
-        mov bh,[esi+3]
-        shl ebx,16
-        mov dl,[esi+4]
-        mov dh,[esi+6]
-        mov bl,[esi+5]
-        and edx,00011000000110000001100000011000b
-        mov bh,[esi+7]
-        ror edx,16+3
-        mov eax,ebx                                     ; setup eax for reds
-        and ebx,00000111000001110000011100000111b
-        and eax,11100000111000001110000011100000b       ; reds
-        ror ebx,16-2
-        add esi,BYTE 8
-        ror eax,16
-        add edi,BYTE 4
-        add eax,ebx
-        mov bl,[esi+1]                                  ; greens
-        add eax,edx
-        mov dl,[esi+0]                                  ; blues
-        mov [edi-4],eax
-        mov dh,[esi+2]
-        dec ecx
-        jnz .L5                 
-    ; check tail
-    pop ecx
-    and ecx,BYTE 11b
-    jz .L7
-.L6 ; tail
-    mov al,[esi+0]
-    mov ah,[esi+1]
-    mov ebx,eax
-    mov edx,eax
-    and eax,BYTE 11000b         ; blue
-    shr eax,3
-    and ebx,11100000000b        ; green
-    shr ebx,6
-    and edx,1110000000000000b   ; red
-    shr edx,8
-    add eax,ebx
-    add eax,edx
-    mov [edi],al
-    add esi,BYTE 2
-    inc edi
-    dec ecx
-    jnz .L6
-.L7 pop ebp
-    retn
-%ifidn __OUTPUT_FORMAT__,elf
-section .note.GNU-stack noalloc noexec nowrite progbits
-%endif
--- a/src/hermes/x86p_32.asm
+++ b/src/hermes/x86p_32.asm
-;
-; x86 format converters for HERMES
-; Some routines Copyright (c) 1998 Christian Nentwich (brn@eleet.mcb.at)
-; This source code is licensed under the GNU LGPL
-; 
-; Please refer to the file COPYING.LIB contained in the distribution for
-; licensing conditions		
-;
-; Most routines are (c) Glenn Fiedler (ptc@gaffer.org), used with permission
-; 
-BITS 32
-%include "common.inc"
-SDL_FUNC _ConvertX86p32_32BGR888
-SDL_FUNC _ConvertX86p32_32RGBA888
-SDL_FUNC _ConvertX86p32_32BGRA888
-SDL_FUNC _ConvertX86p32_24RGB888	
-SDL_FUNC _ConvertX86p32_24BGR888
-SDL_FUNC _ConvertX86p32_16RGB565
-SDL_FUNC _ConvertX86p32_16BGR565
-SDL_FUNC _ConvertX86p32_16RGB555
-SDL_FUNC _ConvertX86p32_16BGR555
-SDL_FUNC _ConvertX86p32_8RGB332
-SECTION .text
-;; _Convert_*
-;; Paramters:	
-;;   ESI = source 
-;;   EDI = dest
-;;   ECX = amount (NOT 0!!! (the _ConvertX86 routine checks for that though))
-;; Destroys:
-;;   EAX, EBX, EDX
-_ConvertX86p32_32BGR888:
-    ; check short
-    cmp ecx,BYTE 32
-    ja .L3
-.L1 ; short loop
-    mov edx,[esi]
-    bswap edx
-    ror edx,8
-    mov [edi],edx
-    add esi,BYTE 4
-    add edi,BYTE 4
-    dec ecx
-    jnz .L1
-.L2
-    retn
-.L3 ; save ebp
-    push ebp
-    ; unroll four times
-    mov ebp,ecx
-    shr ebp,2
-    ; save count
-    push ecx
-.L4     mov eax,[esi]
-        mov ebx,[esi+4]
-        bswap eax
-        bswap ebx
-        ror eax,8
-        mov ecx,[esi+8]
-        ror ebx,8
-        mov edx,[esi+12]
-        bswap ecx
-        bswap edx
-        ror ecx,8
-        mov [edi+0],eax
-        ror edx,8
-        mov [edi+4],ebx
-        mov [edi+8],ecx
-        mov [edi+12],edx
-        add esi,BYTE 16
-        add edi,BYTE 16
-        dec ebp
-        jnz .L4                 
-    ; check tail
-    pop ecx
-    and ecx,BYTE 11b
-    jz .L6
-.L5 ; tail loop
-    mov edx,[esi]
-    bswap edx
-    ror edx,8
-    mov [edi],edx
-    add esi,BYTE 4
-    add edi,BYTE 4
-    dec ecx
-    jnz .L5
-.L6 pop ebp
-    retn
-_ConvertX86p32_32RGBA888:
-    ; check short
-    cmp ecx,BYTE 32
-    ja .L3
-.L1 ; short loop
-    mov edx,[esi]
-    rol edx,8
-    mov [edi],edx
-    add esi,BYTE 4
-    add edi,BYTE 4
-    dec ecx
-    jnz .L1
-.L2
-    retn
-.L3 ; save ebp
-    push ebp
-    ; unroll four times
-    mov ebp,ecx
-    shr ebp,2
-    ; save count
-    push ecx
-.L4     mov eax,[esi]
-        mov ebx,[esi+4]
-        rol eax,8
-        mov ecx,[esi+8]
-        rol ebx,8
-        mov edx,[esi+12]
-        rol ecx,8
-        mov [edi+0],eax
-        rol edx,8
-        mov [edi+4],ebx
-        mov [edi+8],ecx
-        mov [edi+12],edx
-        add esi,BYTE 16
-        add edi,BYTE 16
-        dec ebp
-        jnz .L4                 
-    ; check tail
-    pop ecx
-    and ecx,BYTE 11b
-    jz .L6
-.L5 ; tail loop
-    mov edx,[esi]
-    rol edx,8
-    mov [edi],edx
-    add esi,BYTE 4
-    add edi,BYTE 4
-    dec ecx
-    jnz .L5
-.L6 pop ebp
-    retn
-_ConvertX86p32_32BGRA888:
-    ; check short
-    cmp ecx,BYTE 32
-    ja .L3
-.L1 ; short loop
-    mov edx,[esi]
-    bswap edx
-    mov [edi],edx
-    add esi,BYTE 4
-    add edi,BYTE 4
-    dec ecx
-    jnz .L1
-.L2
-    retn
-.L3 ; save ebp
-    push ebp
-    ; unroll four times
-    mov ebp,ecx
-    shr ebp,2
-    ; save count
-    push ecx
-.L4     mov eax,[esi]
-        mov ebx,[esi+4]
-        mov ecx,[esi+8]
-        mov edx,[esi+12]
-        bswap eax
-        bswap ebx
-        bswap ecx
-        bswap edx
-        mov [edi+0],eax
-        mov [edi+4],ebx
-        mov [edi+8],ecx
-        mov [edi+12],edx
-        add esi,BYTE 16
-        add edi,BYTE 16
-        dec ebp
-        jnz .L4                 
-    ; check tail
-    pop ecx
-    and ecx,BYTE 11b
-    jz .L6
-.L5 ; tail loop
-    mov edx,[esi]
-    bswap edx
-    mov [edi],edx
-    add esi,BYTE 4
-    add edi,BYTE 4
-    dec ecx
-    jnz .L5
-.L6 pop ebp
-    retn
-;; 32 bit RGB 888 to 24 BIT RGB 888
-_ConvertX86p32_24RGB888:
-	; check short
-	cmp ecx,BYTE 32
-	ja .L3
-.L1	; short loop
-	mov al,[esi]
-	mov bl,[esi+1]
-	mov dl,[esi+2]
-	mov [edi],al
-	mov [edi+1],bl
-	mov [edi+2],dl
-	add esi,BYTE 4
-	add edi,BYTE 3
-	dec ecx
-	jnz .L1
-.L2 
-	retn
-.L3	;	 head
-	mov edx,edi
-	and edx,BYTE 11b
-	jz .L4
-	mov al,[esi]
-	mov bl,[esi+1]
-	mov dl,[esi+2]
-	mov [edi],al
-	mov [edi+1],bl
-	mov [edi+2],dl
-	add esi,BYTE 4
-	add edi,BYTE 3
-	dec ecx
-	jmp SHORT .L3
-.L4 ; unroll 4 times
-	push ebp
-	mov ebp,ecx
-	shr ebp,2
-    ; save count
-	push ecx
-.L5     mov eax,[esi]                   ; first dword            eax = [A][R][G][B]
-        mov ebx,[esi+4]                 ; second dword           ebx = [a][r][g][b]
-        shl eax,8                       ;                        eax = [R][G][B][.]
-        mov ecx,[esi+12]                ; third dword            ecx = [a][r][g][b]
-        shl ebx,8                       ;                        ebx = [r][g][b][.]
-        mov al,[esi+4]                  ;                        eax = [R][G][B][b]
-        ror eax,8                       ;                        eax = [b][R][G][B] (done)
-        mov bh,[esi+8+1]                ;                        ebx = [r][g][G][.]
-        mov [edi],eax
-        add edi,BYTE 3*4
-        shl ecx,8                       ;                        ecx = [r][g][b][.]
-        mov bl,[esi+8+0]                ;                        ebx = [r][g][G][B]
-        rol ebx,16                      ;                        ebx = [G][B][r][g] (done)
-        mov cl,[esi+8+2]                ;                        ecx = [r][g][b][R] (done)
-        mov [edi+4-3*4],ebx
-        add esi,BYTE 4*4
-        mov [edi+8-3*4],ecx
-        dec ebp
-        jnz .L5
-    ; check tail
-	pop ecx
-	and ecx,BYTE 11b
-	jz .L7
-.L6 ; tail loop
-	mov al,[esi]
-	mov bl,[esi+1]
-	mov dl,[esi+2]
-	mov [edi],al
-	mov [edi+1],bl
-	mov [edi+2],dl
-	add esi,BYTE 4
-	add edi,BYTE 3
-	dec ecx
-	jnz .L6
-.L7	pop ebp
-	retn
-;; 32 bit RGB 888 to 24 bit BGR 888
-_ConvertX86p32_24BGR888:
-	; check short
-	cmp ecx,BYTE 32
-	ja .L3
-.L1	; short loop
-	mov dl,[esi]
-	mov bl,[esi+1]
-	mov al,[esi+2]
-	mov [edi],al
-	mov [edi+1],bl
-	mov [edi+2],dl
-	add esi,BYTE 4
-	add edi,BYTE 3
-	dec ecx
-	jnz .L1
-.L2
-	retn
-.L3 ; head
-	mov edx,edi
-	and edx,BYTE 11b
-	jz .L4
-	mov dl,[esi]
-	mov bl,[esi+1]
-	mov al,[esi+2]
-	mov [edi],al
-	mov [edi+1],bl
-	mov [edi+2],dl
-	add esi,BYTE 4
-	add edi,BYTE 3
-	dec ecx
-	jmp SHORT .L3
-.L4	; unroll 4 times
-	push ebp
-	mov ebp,ecx
-	shr ebp,2
-	; save count
-	push ecx
-.L5     
-	mov eax,[esi]                   ; first dword            eax = [A][R][G][B]
-        mov ebx,[esi+4]                 ; second dword           ebx = [a][r][g][b]
-        bswap eax                       ;                        eax = [B][G][R][A]
-        bswap ebx                       ;                        ebx = [b][g][r][a]
-        mov al,[esi+4+2]                ;                        eax = [B][G][R][r] 
-        mov bh,[esi+4+4+1]              ;                        ebx = [b][g][G][a]
-        ror eax,8                       ;                        eax = [r][B][G][R] (done)
-        mov bl,[esi+4+4+2]              ;                        ebx = [b][g][G][R]
-        ror ebx,16                      ;                        ebx = [G][R][b][g] (done)
-        mov [edi],eax
-        mov [edi+4],ebx
-        mov ecx,[esi+12]                ; third dword            ecx = [a][r][g][b]
-        bswap ecx                       ;                        ecx = [b][g][r][a]
-        mov cl,[esi+8]                  ;                        ecx = [b][g][r][B] (done)
-        add esi,BYTE 4*4
-        mov [edi+8],ecx
-        add edi,BYTE 3*4
-        dec ebp
-        jnz .L5
-	; check tail
-	pop ecx
-	and ecx,BYTE 11b
-	jz .L7
-.L6	; tail loop
-	mov dl,[esi]
-	mov bl,[esi+1]
-	mov al,[esi+2]
-	mov [edi],al
-	mov [edi+1],bl
-	mov [edi+2],dl
-	add esi,BYTE 4
-	add edi,BYTE 3
-	dec ecx
-	jnz .L6
-.L7 
-	pop ebp
-	retn
-;; 32 bit RGB 888 to 16 BIT RGB 565 
-_ConvertX86p32_16RGB565:
-	; check short
-	cmp ecx,BYTE 16
-	ja .L3
-.L1 ; short loop
-	mov bl,[esi+0]    ; blue
-	mov al,[esi+1]    ; green
-	mov ah,[esi+2]    ; red
-	shr ah,3
-        and al,11111100b
-	shl eax,3
-	shr bl,3
-	add al,bl
-	mov [edi+0],al
-	mov [edi+1],ah
-	add esi,BYTE 4
-	add edi,BYTE 2
-	dec ecx
-	jnz .L1
-.L2:				; End of short loop
-	retn
-.L3	; head
-	mov ebx,edi
-	and ebx,BYTE 11b
-	jz .L4
-	mov bl,[esi+0]    ; blue
-	mov al,[esi+1]    ; green
-	mov ah,[esi+2]    ; red
-	shr ah,3
-	and al,11111100b
-	shl eax,3
-	shr bl,3
-	add al,bl
-	mov [edi+0],al
-	mov [edi+1],ah
-	add esi,BYTE 4
-	add edi,BYTE 2
-	dec ecx
-.L4:	 
-    ; save count
-	push ecx
-    ; unroll twice
-	shr ecx,1
-    ; point arrays to end
-	lea esi,[esi+ecx*8]
-	lea edi,[edi+ecx*4]
-    ; negative counter 
-	neg ecx
-	jmp SHORT .L6
-.L5:	    
-	mov [edi+ecx*4-4],eax
-.L6:	
-	mov eax,[esi+ecx*8]
-        shr ah,2
-        mov ebx,[esi+ecx*8+4]
-        shr eax,3
-        mov edx,[esi+ecx*8+4]
-        shr bh,2
-        mov dl,[esi+ecx*8+2]
-        shl ebx,13
-        and eax,000007FFh
-        shl edx,8
-        and ebx,07FF0000h
-        and edx,0F800F800h
-        add eax,ebx
-        add eax,edx
-        inc ecx
-        jnz .L5                 
-	mov [edi+ecx*4-4],eax
-    ; tail
-	pop ecx
-	test cl,1
-	jz .L7
-	mov bl,[esi+0]    ; blue
-	mov al,[esi+1]    ; green
-	mov ah,[esi+2]    ; red
-	shr ah,3
-	and al,11111100b
-	shl eax,3
-	shr bl,3
-	add al,bl
-	mov [edi+0],al
-	mov [edi+1],ah
-	add esi,BYTE 4
-	add edi,BYTE 2
-.L7:	
-	retn
-;; 32 bit RGB 888 to 16 BIT BGR 565 
-_ConvertX86p32_16BGR565:
-	; check short
-	cmp ecx,BYTE 16
-	ja .L3
-.L1	; short loop
-	mov ah,[esi+0]    ; blue
-	mov al,[esi+1]    ; green
-	mov bl,[esi+2]    ; red
-	shr ah,3
-	and al,11111100b
-	shl eax,3
-	shr bl,3
-	add al,bl
-	mov [edi+0],al
-	mov [edi+1],ah
-	add esi,BYTE 4
-	add edi,BYTE 2
-	dec ecx
-	jnz .L1
-.L2
-	retn
-.L3	; head
-	mov ebx,edi
-	and ebx,BYTE 11b
-	jz .L4   
-	mov ah,[esi+0]    ; blue
-	mov al,[esi+1]    ; green
-	mov bl,[esi+2]    ; red
-	shr ah,3
-	and al,11111100b
-	shl eax,3
-	shr bl,3
-	add al,bl
-	mov [edi+0],al
-	mov [edi+1],ah
-	add esi,BYTE 4
-	add edi,BYTE 2
-	dec ecx
-.L4	; save count
-	push ecx
-	; unroll twice
-	shr ecx,1
-	; point arrays to end
-	lea esi,[esi+ecx*8]
-	lea edi,[edi+ecx*4]
-	; negative count
-	neg ecx
-	jmp SHORT .L6
-.L5     
-	mov [edi+ecx*4-4],eax            
-.L6     
-	mov edx,[esi+ecx*8+4]
-        mov bh,[esi+ecx*8+4]                       
-        mov ah,[esi+ecx*8]                       
-        shr bh,3
-        mov al,[esi+ecx*8+1]             
-        shr ah,3
-        mov bl,[esi+ecx*8+5]           
-        shl eax,3
-        mov dl,[esi+ecx*8+2]
-        shl ebx,19
-        and eax,0000FFE0h              
-        shr edx,3
-        and ebx,0FFE00000h             
-        and edx,001F001Fh               
-        add eax,ebx
-        add eax,edx
-        inc ecx
-        jnz .L5                 
-	mov [edi+ecx*4-4],eax            
-	; tail
-	pop ecx
-	and ecx,BYTE 1
-	jz .L7
-	mov ah,[esi+0]    ; blue
-	mov al,[esi+1]    ; green
-	mov bl,[esi+2]    ; red
-	shr ah,3
-	and al,11111100b
-	shl eax,3
-	shr bl,3
-	add al,bl
-	mov [edi+0],al
-	mov [edi+1],ah
-	add esi,BYTE 4
-	add edi,BYTE 2
-.L7 
-	retn
-;; 32 BIT RGB TO 16 BIT RGB 555
-_ConvertX86p32_16RGB555:
-	; check short
-	cmp ecx,BYTE 16
-	ja .L3
-.L1	; short loop
-	mov bl,[esi+0]    ; blue
-	mov al,[esi+1]    ; green
-	mov ah,[esi+2]    ; red
-	shr ah,3
-	and al,11111000b
-	shl eax,2
-	shr bl,3
-	add al,bl
-	mov [edi+0],al
-	mov [edi+1],ah
-	add esi,BYTE 4
-	add edi,BYTE 2
-	dec ecx
-	jnz .L1
-.L2
-	retn
-.L3	; head
-	mov ebx,edi
-        and ebx,BYTE 11b
-	jz .L4   
-	mov bl,[esi+0]    ; blue
-	mov al,[esi+1]    ; green
-	mov ah,[esi+2]    ; red
-	shr ah,3
-	and al,11111000b
-	shl eax,2
-	shr bl,3
-	add al,bl
-	mov [edi+0],al
-	mov [edi+1],ah
-	add esi,BYTE 4
-	add edi,BYTE 2
-	dec ecx
-.L4	; save count
-	push ecx
-	; unroll twice
-	shr ecx,1
-	; point arrays to end
-	lea esi,[esi+ecx*8]
-	lea edi,[edi+ecx*4]
-	; negative counter 
-	neg ecx
-	jmp SHORT .L6
-.L5     
-	mov [edi+ecx*4-4],eax
-.L6     
-	mov eax,[esi+ecx*8]
-        shr ah,3
-        mov ebx,[esi+ecx*8+4]
-        shr eax,3
-        mov edx,[esi+ecx*8+4]
-        shr bh,3
-        mov dl,[esi+ecx*8+2]
-        shl ebx,13
-        and eax,000007FFh
-        shl edx,7
-        and ebx,07FF0000h
-        and edx,07C007C00h
-        add eax,ebx
-        add eax,edx
-        inc ecx
-        jnz .L5                 
-	mov [edi+ecx*4-4],eax
-	; tail
-	pop ecx
-	and ecx,BYTE 1
-	jz .L7
-	mov bl,[esi+0]    ; blue
-	mov al,[esi+1]    ; green
-	mov ah,[esi+2]    ; red
-	shr ah,3
-	and al,11111000b
-	shl eax,2
-	shr bl,3
-	add al,bl
-	mov [edi+0],al
-	mov [edi+1],ah
-	add esi,BYTE 4
-	add edi,BYTE 2
-.L7
-	retn
-;; 32 BIT RGB TO 16 BIT BGR 555
-_ConvertX86p32_16BGR555:
-	; check short
-	cmp ecx,BYTE 16
-	ja .L3
-.L1	; short loop
-	mov ah,[esi+0]    ; blue
-	mov al,[esi+1]    ; green
-	mov bl,[esi+2]    ; red
-	shr ah,3
-	and al,11111000b
-	shl eax,2
-	shr bl,3
-	add al,bl
-	mov [edi+0],al
-	mov [edi+1],ah
-	add esi,BYTE 4
-	add edi,BYTE 2
-	dec ecx
-	jnz .L1
-.L2 
-	retn
-.L3	; head
-	mov ebx,edi
-        and ebx,BYTE 11b
-	jz .L4   
-	mov ah,[esi+0]    ; blue
-	mov al,[esi+1]    ; green
-	mov bl,[esi+2]    ; red
-	shr ah,3
-	and al,11111000b
-	shl eax,2
-	shr bl,3
-	add al,bl
-	mov [edi+0],al
-	mov [edi+1],ah
-	add esi,BYTE 4
-	add edi,BYTE 2
-	dec ecx
-.L4	; save count
-	push ecx
-	; unroll twice
-	shr ecx,1
-	; point arrays to end
-	lea esi,[esi+ecx*8]
-	lea edi,[edi+ecx*4]
-	; negative counter 
-	neg ecx
-	jmp SHORT .L6
-.L5     
-	mov [edi+ecx*4-4],eax            
-.L6     
-	mov edx,[esi+ecx*8+4]
-        mov bh,[esi+ecx*8+4]                       
-        mov ah,[esi+ecx*8]                       
-        shr bh,3
-        mov al,[esi+ecx*8+1]             
-        shr ah,3
-        mov bl,[esi+ecx*8+5]           
-        shl eax,2
-        mov dl,[esi+ecx*8+2]
-        shl ebx,18
-        and eax,00007FE0h              
-        shr edx,3
-        and ebx,07FE00000h             
-        and edx,001F001Fh               
-        add eax,ebx
-        add eax,edx
-        inc ecx
-        jnz .L5                 
-	mov [edi+ecx*4-4],eax            
-	; tail
-	pop ecx
-	and ecx,BYTE 1
-	jz .L7
-	mov ah,[esi+0]    ; blue
-	mov al,[esi+1]    ; green
-	mov bl,[esi+2]    ; red
-	shr ah,3
-	and al,11111000b
-	shl eax,2
-	shr bl,3
-	add al,bl
-	mov [edi+0],al
-	mov [edi+1],ah
-	add esi,BYTE 4
-	add edi,BYTE 2
-.L7
-	retn
-;; FROM 32 BIT RGB to 8 BIT RGB (rrrgggbbb)
-;; This routine writes FOUR pixels at once (dword) and then, if they exist
-;; the trailing three pixels
-_ConvertX86p32_8RGB332:
-.L_ALIGNED
-	push ecx
-	shr ecx,2		; We will draw 4 pixels at once
-	jnz .L1
-	jmp .L2			; short jump out of range :(
-.L1:
-	mov eax,[esi]		; first pair of pixels
-	mov edx,[esi+4]
-	shr dl,6
-	mov ebx,eax
-	shr al,6
-	and ah,0e0h
-	shr ebx,16
-	and dh,0e0h
-	shr ah,3
-	and bl,0e0h
-	shr dh,3
-	or al,bl
-	mov ebx,edx	
-	or al,ah
-	shr ebx,16
-	or dl,dh
-	and bl,0e0h
-	or dl,bl
-	mov ah,dl
-	mov ebx,[esi+8]		; second pair of pixels
-	mov edx,ebx
-	and bh,0e0h
-	shr bl,6
-	and edx,0e00000h
-	shr edx,16
-	shr bh,3
-	ror eax,16
-	or bl,dl
-	mov edx,[esi+12]
-	or bl,bh
-	mov al,bl
-	mov ebx,edx
-	and dh,0e0h
-	shr dl,6
-	and ebx,0e00000h
-	shr dh,3
-	mov ah,dl
-	shr ebx,16
-	or ah,dh
-	or ah,bl
-	rol eax,16
-	add esi,BYTE 16
-	mov [edi],eax	
-	add edi,BYTE 4
-	dec ecx
-	jz .L2			; L1 out of range for short jump :(
-	jmp .L1
-.L2:
-	pop ecx
-	and ecx,BYTE 3		; mask out number of pixels to draw
-	jz .L4			; Nothing to do anymore
-.L3:
-	mov eax,[esi]		; single pixel conversion for trailing pixels
-        mov ebx,eax
-        shr al,6
-        and ah,0e0h
-        shr ebx,16
-        shr ah,3
-        and bl,0e0h
-        or al,ah
-        or al,bl
-        mov [edi],al
-        inc edi
-        add esi,BYTE 4
-	dec ecx
-	jnz .L3
-.L4:	
-	retn
-%ifidn __OUTPUT_FORMAT__,elf
-section .note.GNU-stack noalloc noexec nowrite progbits
-%endif
--- a/src/video/SDL_blit.c
+++ b/src/video/SDL_blit.c
@@ -24,6 +24,7 @@
 #include "SDL_video.h"
 #include "SDL_sysvideo.h"
 #include "SDL_blit.h"
+#include "SDL_blit_copy.h"
 #include "SDL_RLEaccel_c.h"
 #include "SDL_pixels_c.h"
@@ -106,111 +107,64 @@ SDL_SoftBlit(SDL_Surface * src, SDL_Rect * srcrect,
    return (okay ? 0 : -1);
 }
-#ifdef MMX_ASMBLIT
+#ifdef __MACOSX__
-static __inline__ void
+#include <sys/sysctl.h>
+/*
+ * Decide which Altivec blitter flavour to use.
+ *
+ * Queries the "hw.l3cachesize" sysctl and returns SDL_TRUE only when the
+ * query succeeds and reports a non-zero value; otherwise returns SDL_FALSE.
+ */
-SDL_memcpyMMX(Uint8 * to, const Uint8 * from, int len)
+static SDL_bool SDL_UseAltivecPrefetch(void)
 {
-    int i;
+    const char key[] = "hw.l3cachesize";
+    u_int64_t result = 0;
+    size_t typeSize = sizeof(result);
-    for (i = 0; i < len / 8; i++) {
+    if (sysctlbyname(key, &result, &typeSize, NULL, 0) == 0 && result > 0) {
-        __asm__ __volatile__("	movq (%0), %%mm0\n"
+        return SDL_TRUE;
-                             "	movq %%mm0, (%1)\n"::"r"(from),
+    } else {
-                             "r"(to):"memory");
+        return SDL_FALSE;
-        from += 8;
-        to += 8;
    }
-    if (len & 7)
-        SDL_memcpy(to, from, len & 7);
 }
+#else
+/* No way to query the cache layout here, so always report SDL_TRUE. */
+static SDL_bool SDL_UseAltivecPrefetch(void)
+{
+    /* Just guess G4 */
+    return SDL_TRUE;
+}
+#endif /* __MACOSX__ */
-static __inline__ void
+/*
+ * Pick the most specialised blitter the current CPU supports.
+ *
+ * entries - table of candidate blitters; entries[0] is the generic fallback
+ *           and is returned when no other entry matches.
+ * count   - number of elements in entries (NOTE(review): assumed from the
+ *           name; no caller is visible here - confirm).
+ *
+ * The CPU feature mask is computed once and cached in a function-local
+ * static.  Setting the SDL_BLIT_FEATURES environment variable to an unsigned
+ * decimal number replaces the detected mask, which is useful for testing.
+ */
+static SDL_loblit SDL_ChooseBlitFunc(SDL_BlitEntry *entries, int count)
-SDL_memcpySSE(Uint8 * to, const Uint8 * from, int len)
 {
    int i;
+    static Uint32 features = 0xffffffff;
-    __asm__ __volatile__("	prefetchnta (%0)\n"
+    if (features == 0xffffffff) {
-                         "	prefetchnta 64(%0)\n"
+        /* Provide an override for testing .. */
-                         "	prefetchnta 128(%0)\n"
-                         "	prefetchnta 192(%0)\n"::"r"(from));
-    for (i = 0; i < len / 8; i++) {
+        const char *override = SDL_getenv("SDL_BLIT_FEATURES");
-        __asm__ __volatile__("	prefetchnta 256(%0)\n"
+        features = SDL_BLIT_ANY;
-                             "	movq (%0), %%mm0\n"
+        if (override) {
-                             "	movntq %%mm0, (%1)\n"::"r"(from),
+            SDL_sscanf(override, "%u", &features);
-                             "r"(to):"memory");
+        } else {
-        from += 8;
+            if (SDL_HasMMX()) {
-        to += 8;
+                features |= SDL_BLIT_MMX;
            }
-    if (len & 7)
-        SDL_memcpy(to, from, len & 7);
-}
-#endif
-static void
-SDL_BlitCopy(SDL_BlitInfo * info)
-{
-    Uint8 *src, *dst;
-    int w, h;
-    int srcskip, dstskip;
-    w = info->d_width * info->dst->BytesPerPixel;
-    h = info->d_height;
-    src = info->s_pixels;
-    dst = info->d_pixels;
-    srcskip = w + info->s_skip;
-    dstskip = w + info->d_skip;
-#ifdef MMX_ASMBLIT
            if (SDL_HasSSE()) {
-        while (h--) {
+                features |= SDL_BLIT_SSE;
-            SDL_memcpySSE(dst, src, w);
+            }
-            src += srcskip;
+            if (SDL_HasAltivec()) {
-            dst += dstskip;
+                if (SDL_UseAltivecPrefetch()) {
+                    features |= SDL_BLIT_ALTIVEC_PREFETCH;
+                } else {
+                    features |= SDL_BLIT_ALTIVEC_NOPREFETCH;
                }
-        __asm__ __volatile__("	emms\n"::);
-    } else if (SDL_HasMMX()) {
-        while (h--) {
-            SDL_memcpyMMX(dst, src, w);
-            src += srcskip;
-            dst += dstskip;
            }
-        __asm__ __volatile__("	emms\n"::);
-    } else
-#endif
-        while (h--) {
-            SDL_memcpy(dst, src, w);
-            src += srcskip;
-            dst += dstskip;
        }
-}
-static void
-SDL_BlitCopyOverlap(SDL_BlitInfo * info)
-{
-    Uint8 *src, *dst;
-    int w, h;
-    int srcskip, dstskip;
-    w = info->d_width * info->dst->BytesPerPixel;
-    h = info->d_height;
-    src = info->s_pixels;
-    dst = info->d_pixels;
-    srcskip = w + info->s_skip;
-    dstskip = w + info->d_skip;
-    if (dst < src) {
-        while (h--) {
-            SDL_memcpy(dst, src, w);
-            src += srcskip;
-            dst += dstskip;
    }
-    } else {
-        src += ((h - 1) * srcskip);
+    /* Scan from the last valid index (count - 1) down to 1; index 0 is the fallback */
+    for (i = count - 1; i > 0; --i) {
-        dst += ((h - 1) * dstskip);
+        if (features & entries[i].features) {
-        while (h--) {
+            return entries[i].blit;
-            SDL_revcpy(dst, src, w);
-            src -= srcskip;
-            dst -= dstskip;
        }
    }
+    return entries[0].blit;
 }
 /* Figure out which of many blit routines to set up on a surface */
@@ -237,11 +191,11 @@ SDL_CalculateBlit(SDL_Surface * surface)
    /* Check for special "identity" case -- copy blit */
    if (surface->map->identity && blit_index == 0) {
-        surface->map->sw_data->blit = SDL_BlitCopy;
        /* Handle overlapping blits on the same surface */
        if (surface == surface->map->dst) {
            surface->map->sw_data->blit = SDL_BlitCopyOverlap;
+        } else {
+            surface->map->sw_data->blit = SDL_BlitCopy;
        }
    } else {
        if (surface->format->BitsPerPixel < 8) {

--- a/src/video/SDL_blit.h
+++ b/src/video/SDL_blit.h
@@ -67,6 +67,17 @@ typedef struct SDL_BlitMap
    unsigned int format_version;
 } SDL_BlitMap;
+/*
+ * CPU feature bits used to select a blitter.  SDL_ChooseBlitFunc() in
+ * SDL_blit.c ORs the detected bits into a mask and tests it against each
+ * entry's `features` field.  SDL_BLIT_ANY is zero, so it never matches that
+ * bitwise test; it marks an entry that needs no special CPU support.
+ */
+#define SDL_BLIT_ANY                0x00000000
+#define SDL_BLIT_MMX                0x00000001
+#define SDL_BLIT_SSE                0x00000002
+#define SDL_BLIT_ALTIVEC_PREFETCH   0x00000004
+#define SDL_BLIT_ALTIVEC_NOPREFETCH 0x00000008
+/* One candidate blitter together with the CPU features it requires. */
+typedef struct SDL_BlitEntry
+{
+    Uint32 features;            /* mask of SDL_BLIT_* bits */
+    SDL_loblit blit;            /* blit routine returned when the mask matches */
+} SDL_BlitEntry;
 /* Functions found in SDL_blit.c */
 extern int SDL_CalculateBlit(SDL_Surface * surface);

--- a/src/video/SDL_blit_N.c
+++ b/src/video/SDL_blit_N.c
@@ -879,19 +879,6 @@ GetBlitFeatures(void)
 #define LO	1
 #endif
-#if SDL_HERMES_BLITTERS
-/* Heheheh, we coerce Hermes into using SDL blit information */
-#define X86_ASSEMBLER
-#define HermesConverterInterface	SDL_BlitInfo
-#define HermesClearInterface		void
-#define STACKCALL
-#include "../hermes/HeadMMX.h"
-#include "../hermes/HeadX86.h"
-#else
 /* Special optimized blit for RGB 8-8-8 --> RGB 3-3-2 */
 #define RGB888_RGB332(dst, src) { \
 	dst = (Uint8)((((src)&0x00E00000)>>16)| \
@@ -1250,8 +1237,6 @@ Blit_RGB888_RGB565(SDL_BlitInfo * info)
 #endif /* USE_DUFFS_LOOP */
 }
-#endif /* SDL_HERMES_BLITTERS */
 /* Special optimized blit for RGB 5-6-5 --> 32-bit RGB surfaces */
 #define RGB565_32(dst, src, map) (map[src[LO]*2] + map[src[HI]*2+1])
@@ -2357,17 +2342,7 @@ static const struct blit_table normal_blit_1[] = {
    {0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL},
 };
 static const struct blit_table normal_blit_2[] = {
-#if SDL_HERMES_BLITTERS
+#if SDL_ALTIVEC_BLITTERS
-    {0x0000F800, 0x000007E0, 0x0000001F, 2, 0x0000001F, 0x000007E0,
-     0x0000F800,
-     0, ConvertX86p16_16BGR565, ConvertX86, NO_ALPHA},
-    {0x0000F800, 0x000007E0, 0x0000001F, 2, 0x00007C00, 0x000003E0,
-     0x0000001F,
-     0, ConvertX86p16_16RGB555, ConvertX86, NO_ALPHA},
-    {0x0000F800, 0x000007E0, 0x0000001F, 2, 0x0000001F, 0x000003E0,
-     0x00007C00,
-     0, ConvertX86p16_16BGR555, ConvertX86, NO_ALPHA},
-#elif SDL_ALTIVEC_BLITTERS
    /* has-altivec */
    {0x0000F800, 0x000007E0, 0x0000001F, 4, 0x00000000, 0x00000000,
     0x00000000,
@@ -2397,47 +2372,6 @@ static const struct blit_table normal_blit_3[] = {
    {0, 0, 0, 0, 0, 0, 0, 0, NULL, BlitNtoN, 0}
 };
 static const struct blit_table normal_blit_4[] = {
-#if SDL_HERMES_BLITTERS
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x0000F800, 0x000007E0,
-     0x0000001F,
-     1, ConvertMMXpII32_16RGB565, ConvertMMX, NO_ALPHA},
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x0000F800, 0x000007E0,
-     0x0000001F,
-     0, ConvertX86p32_16RGB565, ConvertX86, NO_ALPHA},
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x0000001F, 0x000007E0,
-     0x0000F800,
-     1, ConvertMMXpII32_16BGR565, ConvertMMX, NO_ALPHA},
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x0000001F, 0x000007E0,
-     0x0000F800,
-     0, ConvertX86p32_16BGR565, ConvertX86, NO_ALPHA},
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x00007C00, 0x000003E0,
-     0x0000001F,
-     1, ConvertMMXpII32_16RGB555, ConvertMMX, NO_ALPHA},
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x00007C00, 0x000003E0,
-     0x0000001F,
-     0, ConvertX86p32_16RGB555, ConvertX86, NO_ALPHA},
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x0000001F, 0x000003E0,
-     0x00007C00,
-     1, ConvertMMXpII32_16BGR555, ConvertMMX, NO_ALPHA},
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x0000001F, 0x000003E0,
-     0x00007C00,
-     0, ConvertX86p32_16BGR555, ConvertX86, NO_ALPHA},
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 3, 0x00FF0000, 0x0000FF00,
-     0x000000FF,
-     0, ConvertX86p32_24RGB888, ConvertX86, NO_ALPHA},
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 3, 0x000000FF, 0x0000FF00,
-     0x00FF0000,
-     0, ConvertX86p32_24BGR888, ConvertX86, NO_ALPHA},
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 4, 0x000000FF, 0x0000FF00,
-     0x00FF0000,
-     0, ConvertX86p32_32BGR888, ConvertX86, NO_ALPHA},
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 4, 0xFF000000, 0x00FF0000,
-     0x0000FF00,
-     0, ConvertX86p32_32RGBA888, ConvertX86, NO_ALPHA},
-    {0x00FF0000, 0x0000FF00, 0x000000FF, 4, 0x0000FF00, 0x00FF0000,
-     0xFF000000,
-     0, ConvertX86p32_32BGRA888, ConvertX86, NO_ALPHA},
-#else
 #if SDL_ALTIVEC_BLITTERS
    /* has-altivec | dont-use-prefetch */
    {0x00000000, 0x00000000, 0x00000000, 4, 0x00000000, 0x00000000,
@@ -2460,7 +2394,6 @@ static const struct blit_table normal_blit_4[] = {
    {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x00007C00, 0x000003E0,
     0x0000001F,
     0, NULL, Blit_RGB888_RGB555, NO_ALPHA},
-#endif
    /* Default for 32-bit RGB source, used if no other blitter matches */
    {0, 0, 0, 0, 0, 0, 0, 0, NULL, BlitNtoN, 0}
 };
@@ -2529,12 +2462,7 @@ SDL_CalculateBlitN(SDL_Surface * surface, int blit_index)
            if (surface->map->table) {
                blitfun = Blit_RGB888_index8_map;
            } else {
-#if SDL_HERMES_BLITTERS
-                sdata->aux_data = ConvertX86p32_8RGB332;
-                blitfun = ConvertX86;
-#else
                blitfun = Blit_RGB888_index8;
-#endif
            }
        } else {
            blitfun = BlitNto1;
@@ -2575,13 +2503,6 @@ SDL_CalculateBlitN(SDL_Surface * surface, int blit_index)
    }
 #ifdef DEBUG_ASM
-#if SDL_HERMES_BLITTERS
-    if (blitfun == ConvertMMX)
-        fprintf(stderr, "Using mmx blit\n");
-    else if (blitfun == ConvertX86)
-        fprintf(stderr, "Using asm blit\n");
-    else
-#endif
    if ((blitfun == BlitNtoN) || (blitfun == BlitNto1))
        fprintf(stderr, "Using C blit\n");
    else

--- a/src/video/SDL_blit_copy.c
+++ b/src/video/SDL_blit_copy.c
+/*
+    SDL - Simple DirectMedia Layer
+    Copyright (C) 1997-2006 Sam Lantinga
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+    Sam Lantinga
+    slouken@libsdl.org
+*/
+#include "SDL_config.h"
+#include "SDL_video.h"
+#include "SDL_blit.h"
+/* The MMX/SSE intrinsics don't give access to specific registers for
+   the most memory parallelism, so we'll use GCC inline assembly here...
+*/
+#ifndef __GNUC__
+#undef __MMX__
+#undef __SSE__
+#endif
+#ifdef __MMX__
+/* Copy len bytes from src to dst, moving 64 bytes per iteration through
+   the eight MMX registers and handing the remaining (len & 63) bytes to
+   SDL_memcpy.
+   Only baseline MMX instructions are used: "prefetchnta" and "movntq"
+   arrived with SSE / extended MMX, while the caller selects this path
+   after checking SDL_HasMMX() only, so they could raise an invalid-opcode
+   fault on processors that have MMX but not SSE.
+   The MMX registers are declared as clobbered so the compiler never keeps
+   a live value in them across the asm statement.
+   This function does not execute "emms"; the caller must do that once
+   after its last copy.
+*/
+static __inline__ void
+SDL_memcpyMMX(Uint8 *dst, const Uint8 *src, int len)
+{
+    int i;
+    for (i = len / 64; i--;) {
+        /* Issue all eight loads before the stores so they can overlap. */
+        __asm__ __volatile__ (
+        "movq (%0), %%mm0\n"
+        "movq 8(%0), %%mm1\n"
+        "movq 16(%0), %%mm2\n"
+        "movq 24(%0), %%mm3\n"
+        "movq 32(%0), %%mm4\n"
+        "movq 40(%0), %%mm5\n"
+        "movq 48(%0), %%mm6\n"
+        "movq 56(%0), %%mm7\n"
+        "movq %%mm0, (%1)\n"
+        "movq %%mm1, 8(%1)\n"
+        "movq %%mm2, 16(%1)\n"
+        "movq %%mm3, 24(%1)\n"
+        "movq %%mm4, 32(%1)\n"
+        "movq %%mm5, 40(%1)\n"
+        "movq %%mm6, 48(%1)\n"
+        "movq %%mm7, 56(%1)\n"
+        :: "r" (src), "r" (dst)
+        : "memory", "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");
+        src += 64;
+        dst += 64;
+    }
+    /* Tail shorter than one 64-byte block. */
+    if (len & 63) {
+        SDL_memcpy(dst, src, len & 63);
+    }
+}
+#endif /* __MMX__ */
+#ifdef __SSE__
+/* Copy len bytes from src to dst, moving 64 bytes per iteration through
+   four SSE registers with non-temporal stores, and handing the remaining
+   (len & 63) bytes to SDL_memcpy.
+   Precondition: src and dst must both be 16-byte aligned, because
+   "movaps" and "movntps" fault on unaligned addresses.  The caller is
+   responsible for checking this for every row it passes in.
+   The XMM registers are declared as clobbered so the compiler never keeps
+   a live value (e.g. a float) in them across the asm statement.
+*/
+static __inline__ void
+SDL_memcpySSE(Uint8 *dst, const Uint8 *src, int len)
+{
+    int i;
+    for (i = len / 64; i--;) {
+        /* Issue all four loads before the stores so they can overlap. */
+        __asm__ __volatile__ (
+        "prefetchnta (%0)\n"
+        "movaps (%0), %%xmm0\n"
+        "movaps 16(%0), %%xmm1\n"
+        "movaps 32(%0), %%xmm2\n"
+        "movaps 48(%0), %%xmm3\n"
+        "movntps %%xmm0, (%1)\n"
+        "movntps %%xmm1, 16(%1)\n"
+        "movntps %%xmm2, 32(%1)\n"
+        "movntps %%xmm3, 48(%1)\n"
+        :: "r" (src), "r" (dst)
+        : "memory", "xmm0", "xmm1", "xmm2", "xmm3");
+        src += 64;
+        dst += 64;
+    }
+    /* Tail shorter than one 64-byte block. */
+    if (len & 63) {
+        SDL_memcpy(dst, src, len & 63);
+    }
+}
+#endif /* __SSE__ */
+/* Copy a rectangle of pixels row by row without any format conversion.
+   Each row is d_width * dst->BytesPerPixel bytes long; after a row the
+   source and destination pointers advance by that width plus s_skip and
+   d_skip respectively.
+   The SSE path uses aligned loads and stores, so it is taken only when
+   EVERY row is 16-byte aligned: the start pointers and both per-row
+   strides must be multiples of 16.  Checking the start pointers alone
+   would fault on the second row whenever a stride is not a multiple of 16.
+   Falls back to the MMX copy, then to plain SDL_memcpy.
+*/
+void
+SDL_BlitCopy(SDL_BlitInfo * info)
+{
+    Uint8 *src, *dst;
+    int w, h;
+    int srcskip, dstskip;
+    w = info->d_width * info->dst->BytesPerPixel;
+    h = info->d_height;
+    src = info->s_pixels;
+    dst = info->d_pixels;
+    /* Full distance in bytes from the start of one row to the next. */
+    srcskip = w + info->s_skip;
+    dstskip = w + info->d_skip;
+#ifdef __SSE__
+    if (SDL_HasSSE() &&
+        !((uintptr_t)src & 15) && !(srcskip & 15) &&
+        !((uintptr_t)dst & 15) && !(dstskip & 15)) {
+        while (h--) {
+            SDL_memcpySSE(dst, src, w);
+            src += srcskip;
+            dst += dstskip;
+        }
+        return;
+    }
+#endif
+#ifdef __MMX__
+    if (SDL_HasMMX() && !((uintptr_t)src & 7) && !((uintptr_t)dst & 7)) {
+        while (h--) {
+            SDL_memcpyMMX(dst, src, w);
+            src += srcskip;
+            dst += dstskip;
+        }
+        /* Leave MMX state so later x87 floating-point code works. */
+        __asm__ __volatile__("	emms\n"::);
+        return;
+    }
+#endif
+    while (h--) {
+        SDL_memcpy(dst, src, w);
+        src += srcskip;
+        dst += dstskip;
+    }
+}
+/* Copy a rectangle of pixels when the source and destination regions may
+   overlap.  The same row stride (row width plus s_skip) is applied to both
+   pointers.  When the destination starts before the source, or at/after
+   the end of the source span, the ordinary forward copy is used;
+   otherwise rows are copied last-to-first with SDL_revcpy.
+*/
+void
+SDL_BlitCopyOverlap(SDL_BlitInfo * info)
+{
+    int row_bytes = info->d_width * info->dst->BytesPerPixel;
+    int rows = info->d_height;
+    Uint8 *from = info->s_pixels;
+    Uint8 *to = info->d_pixels;
+    int stride = row_bytes + info->s_skip;
+    Uint8 *from_end = from + rows * stride;
+    if ((to < from) || (to >= from_end)) {
+        /* Forward copy cannot overwrite source bytes not yet read. */
+        SDL_BlitCopy(info);
+        return;
+    }
+    /* Start at the last row and walk backwards. */
+    from += (rows - 1) * stride;
+    to += (rows - 1) * stride;
+    while (rows--) {
+        SDL_revcpy(to, from, row_bytes);
+        from -= stride;
+        to -= stride;
+    }
+}
+/* vi: set ts=4 sw=4 expandtab: */
--- a/src/video/SDL_blit_copy.h
+++ b/src/video/SDL_blit_copy.h
+/*
+    SDL - Simple DirectMedia Layer
+    Copyright (C) 1997-2006 Sam Lantinga
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+    Sam Lantinga
+    slouken@libsdl.org
+*/
+#ifndef SDL_blit_copy_h_
+#define SDL_blit_copy_h_
+/* NOTE(review): SDL_BlitInfo is not declared here; presumably SDL_blit.h
+   must be included before this header -- confirm against callers. */
+/* Row-by-row copy of the rectangle described by info. */
+void SDL_BlitCopy(SDL_BlitInfo * info);
+/* Same copy for source and destination regions that may overlap. */
+void SDL_BlitCopyOverlap(SDL_BlitInfo * info);
+#endif /* SDL_blit_copy_h_ */
+/* vi: set ts=4 sw=4 expandtab: */