Commit 37fe3a93 authored by Sam Lantinga's avatar Sam Lantinga

Removed hermes since it's LGPL and not compatible with a commercial license.

Prepping for using MMX and SSE intrinsics instead of inline assembly.
.. except for memcpy equivalents which only get faster if they can
   exploit the parallelism of loading into multiple SIMD registers. :)

--HG--
extra : convert_revision : svn%3Ac70aab31-4412-0410-b14c-859654838e24/trunk%402609
parent 92c5ea48
......@@ -65,12 +65,6 @@ __EOF__
\$(LIBTOOL) --mode=compile \$(CC) \$(CFLAGS) \$(EXTRA_CFLAGS) -c $src -o \$@
__EOF__
;;
asm) cat >>${output}.new <<__EOF__
\$(LIBTOOL) --tag=CC --mode=compile \$(auxdir)/strip_fPIC.sh \$(NASM) $src -o \$@
__EOF__
;;
S) cat >>${output}.new <<__EOF__
......
This diff is collapsed.
......@@ -292,7 +292,6 @@
/* Enable assembly routines */
#undef SDL_ASSEMBLY_ROUTINES
#undef SDL_HERMES_BLITTERS
#undef SDL_ALTIVEC_BLITTERS
#endif /* _SDL_config_h */
This diff is collapsed.
/*
Header definitions for the MMX routines for the HERMES library
Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
This source code is licensed under the GNU LGPL
Please refer to the file COPYING.LIB contained in the distribution for
licensing conditions
*/
#include "SDL_config.h"
#ifndef __HERMES_HEAD_MMX__
#define __HERMES_HEAD_MMX__
/* If you cannot stand ifdefs, then please do not look into this file, it's
going to end your life :) */
#ifdef X86_ASSEMBLER
#ifdef __cplusplus
extern "C"
{
#endif
/* Row driver implemented in assembly: calls the per-scanline converter
   stored in the interface structure once for every row. */
void STACKCALL ConvertMMX(HermesConverterInterface *);
/* Clear routines, one per pixel size in bits. */
void STACKCALL ClearMMX_32(HermesClearInterface *);
void STACKCALL ClearMMX_24(HermesClearInterface *);
void STACKCALL ClearMMX_16(HermesClearInterface *);
void STACKCALL ClearMMX_8(HermesClearInterface *);
/* Per-scanline converters.  They receive their arguments in registers
   (esi = source, edi = destination, ecx = pixel count) and are meant to be
   invoked through ConvertMMX, not called directly from C.  Each routine is
   declared exactly once. */
void ConvertMMXpII32_24RGB888();
void ConvertMMXpII32_16RGB565();
void ConvertMMXpII32_16BGR565();
void ConvertMMXpII32_16RGB555();
void ConvertMMXpII32_16BGR555();
void ConvertMMXp32_16RGB555();
#ifdef __cplusplus
}
#endif
/* Fix the underscore business with ELF compilers */
#if defined(__ELF__) && defined(__GNUC__)
#ifdef __cplusplus
extern "C"
{
#endif
extern void _ConvertMMX(HermesConverterInterface *);
extern void _ConvertMMXpII32_24RGB888();
extern void _ConvertMMXpII32_16RGB565();
extern void _ConvertMMXpII32_16BGR565();
extern void _ConvertMMXpII32_16RGB555();
extern void _ConvertMMXpII32_16BGR555();
#define ConvertMMX _ConvertMMX
#define ConvertMMXpII32_24RGB888 _ConvertMMXpII32_24RGB888
#define ConvertMMXpII32_16RGB565 _ConvertMMXpII32_16RGB565
#define ConvertMMXpII32_16BGR565 _ConvertMMXpII32_16BGR565
#define ConvertMMXpII32_16RGB555 _ConvertMMXpII32_16RGB555
#define ConvertMMXpII32_16BGR555 _ConvertMMXpII32_16BGR555
#ifdef __cplusplus
}
#endif
#endif /* ELF and GNUC */
/* Make it work with Watcom */
#ifdef __WATCOMC__
#pragma warning 601 9
#pragma aux ConvertMMX "_*" modify [EAX EBX ECX EDX ESI EDI]
#pragma aux ClearMMX_32 "_*" modify [EAX EBX ECX EDX ESI EDI]
#pragma aux ClearMMX_24 "_*" modify [EAX EBX ECX EDX ESI EDI]
#pragma aux ClearMMX_16 "_*" modify [EAX EBX ECX EDX ESI EDI]
#pragma aux ClearMMX_8 "_*" modify [EAX EBX ECX EDX ESI EDI]
#pragma aux ConvertMMXpII32_24RGB888 "_*"
#pragma aux ConvertMMXpII32_16RGB565 "_*"
#pragma aux ConvertMMXpII32_16BGR565 "_*"
#pragma aux ConvertMMXpII32_16RGB555 "_*"
#pragma aux ConvertMMXpII32_16BGR555 "_*"
#pragma aux ConvertMMXp32_16RGB555 "_*"
#endif /* WATCOM */
#endif /* X86_ASSEMBLER */
#endif
/* vi: set ts=4 sw=4 expandtab: */
/*
Header definitions for the x86 routines for the HERMES library
Copyright (c) 1998 Christian Nentwich (brn@eleet.mcb.at)
This source code is licensed under the GNU LGPL
Please refer to the file COPYING.LIB contained in the distribution for
licensing conditions
*/
#ifndef __HERMES_HEAD_X86__
#define __HERMES_HEAD_X86__
#ifdef X86_ASSEMBLER
/* If you can't stand IFDEFS, then close your eyes now, please :) */
/* Ok, we start with normal function definitions */
#ifdef __cplusplus
extern "C"
{
#endif
void STACKCALL ConvertX86(HermesConverterInterface *);
void STACKCALL ClearX86_32(HermesClearInterface *);
void STACKCALL ClearX86_24(HermesClearInterface *);
void STACKCALL ClearX86_16(HermesClearInterface *);
void STACKCALL ClearX86_8(HermesClearInterface *);
int STACKCALL Hermes_X86_CPU();
void ConvertX86p32_32BGR888();
void ConvertX86p32_32RGBA888();
void ConvertX86p32_32BGRA888();
void ConvertX86p32_24RGB888();
void ConvertX86p32_24BGR888();
void ConvertX86p32_16RGB565();
void ConvertX86p32_16BGR565();
void ConvertX86p32_16RGB555();
void ConvertX86p32_16BGR555();
void ConvertX86p32_8RGB332();
void ConvertX86p16_32RGB888();
void ConvertX86p16_32BGR888();
void ConvertX86p16_32RGBA888();
void ConvertX86p16_32BGRA888();
void ConvertX86p16_24RGB888();
void ConvertX86p16_24BGR888();
void ConvertX86p16_16BGR565();
void ConvertX86p16_16RGB555();
void ConvertX86p16_16BGR555();
void ConvertX86p16_8RGB332();
void CopyX86p_4byte();
void CopyX86p_3byte();
void CopyX86p_2byte();
void CopyX86p_1byte();
void ConvertX86pI8_32();
void ConvertX86pI8_24();
void ConvertX86pI8_16();
extern int ConvertX86p16_32RGB888_LUT_X86[512];
extern int ConvertX86p16_32BGR888_LUT_X86[512];
extern int ConvertX86p16_32RGBA888_LUT_X86[512];
extern int ConvertX86p16_32BGRA888_LUT_X86[512];
#ifdef __cplusplus
}
#endif
/* Now fix up the ELF underscore problem */
#if defined(__ELF__) && defined(__GNUC__)
#ifdef __cplusplus
extern "C"
{
#endif
extern int _Hermes_X86_CPU();
extern void _ConvertX86(HermesConverterInterface *);
extern void _ConvertX86p32_32BGR888();
extern void _ConvertX86p32_32RGBA888();
extern void _ConvertX86p32_32BGRA888();
extern void _ConvertX86p32_24RGB888();
extern void _ConvertX86p32_24BGR888();
extern void _ConvertX86p32_16RGB565();
extern void _ConvertX86p32_16BGR565();
extern void _ConvertX86p32_16RGB555();
extern void _ConvertX86p32_16BGR555();
extern void _ConvertX86p32_8RGB332();
extern void _ConvertX86p16_16BGR565();
extern void _ConvertX86p16_16RGB555();
extern void _ConvertX86p16_16BGR555();
extern void _ConvertX86p16_8RGB332();
#define Hermes_X86_CPU _Hermes_X86_CPU
#define ConvertX86 _ConvertX86
#define ConvertX86p32_32BGR888 _ConvertX86p32_32BGR888
#define ConvertX86p32_32RGBA888 _ConvertX86p32_32RGBA888
#define ConvertX86p32_32BGRA888 _ConvertX86p32_32BGRA888
#define ConvertX86p32_24RGB888 _ConvertX86p32_24RGB888
#define ConvertX86p32_24BGR888 _ConvertX86p32_24BGR888
#define ConvertX86p32_16RGB565 _ConvertX86p32_16RGB565
#define ConvertX86p32_16BGR565 _ConvertX86p32_16BGR565
#define ConvertX86p32_16RGB555 _ConvertX86p32_16RGB555
#define ConvertX86p32_16BGR555 _ConvertX86p32_16BGR555
#define ConvertX86p32_8RGB332 _ConvertX86p32_8RGB332
#define ConvertX86p16_16BGR565 _ConvertX86p16_16BGR565
#define ConvertX86p16_16RGB555 _ConvertX86p16_16RGB555
#define ConvertX86p16_16BGR555 _ConvertX86p16_16BGR555
#define ConvertX86p16_8RGB332 _ConvertX86p16_8RGB332
#ifdef __cplusplus
}
#endif
#endif /* ELF & GNU */
/* Make it run with WATCOM C */
#ifdef __WATCOMC__
#pragma warning 601 9
#pragma aux Hermes_X86_CPU "_*"
#pragma aux ConvertX86 "_*" modify [EAX EBX ECX EDX ESI EDI]
#pragma aux ClearX86_32 "_*" modify [EAX EBX ECX EDX ESI EDI]
#pragma aux ClearX86_24 "_*" modify [EAX EBX ECX EDX ESI EDI]
#pragma aux ClearX86_16 "_*" modify [EAX EBX ECX EDX ESI EDI]
#pragma aux ClearX86_8 "_*" modify [EAX EBX ECX EDX ESI EDI]
#pragma aux ConvertX86p32_32BGR888 "_*"
#pragma aux ConvertX86p32_32RGBA888 "_*"
#pragma aux ConvertX86p32_32BGRA888 "_*"
#pragma aux ConvertX86p32_24RGB888 "_*"
#pragma aux ConvertX86p32_24BGR888 "_*"
#pragma aux ConvertX86p32_16RGB565 "_*"
#pragma aux ConvertX86p32_16BGR565 "_*"
#pragma aux ConvertX86p32_16RGB555 "_*"
#pragma aux ConvertX86p32_16BGR555 "_*"
#pragma aux ConvertX86p32_8RGB332 "_*"
#pragma aux ConvertX86p16_32RGB888 "_*"
#pragma aux ConvertX86p16_32BGR888 "_*"
#pragma aux ConvertX86p16_32RGBA888 "_*"
#pragma aux ConvertX86p16_32BGRA888 "_*"
#pragma aux ConvertX86p16_24RGB888 "_*"
#pragma aux ConvertX86p16_24BGR888 "_*"
#pragma aux ConvertX86p16_16BGR565 "_*"
#pragma aux ConvertX86p16_16RGB555 "_*"
#pragma aux ConvertX86p16_16BGR555 "_*"
#pragma aux ConvertX86p16_8RGB332 "_*"
#pragma aux CopyX86p_4byte "_*"
#pragma aux CopyX86p_3byte "_*"
#pragma aux CopyX86p_2byte "_*"
#pragma aux CopyX86p_1byte "_*"
#pragma aux ConvertX86pI8_32 "_*"
#pragma aux ConvertX86pI8_24 "_*"
#pragma aux ConvertX86pI8_16 "_*"
#pragma aux ConvertX86p16_32RGB888_LUT_X86 "_*"
#pragma aux ConvertX86p16_32BGR888_LUT_X86 "_*"
#pragma aux ConvertX86p16_32RGBA888_LUT_X86 "_*"
#pragma aux ConvertX86p16_32BGRA888_LUT_X86 "_*"
#endif /* __WATCOMC__ */
#endif /* X86_ASSEMBLER */
#endif
/* vi: set ts=4 sw=4 expandtab: */
HERMES 1.2.4 (c)1998 Christian Nentwich (brn) (c.nentwich@cs.ucl.ac.uk)
and quite a few assembler routines (c) Glenn Fiedler (gaffer@gaffer.org)
This library and all the files enclosed in this package are free software
under the terms of the GNU Library General Public License (LGPL). Please
refer to the included file COPYING.LIB for the exact terms.
----------------------------------------------------------------------------
This is a stripped down version of HERMES, including only the x86 assembler
converters, for use with Simple DirectMedia Layer.
The full HERMES library is available at: http://hermes.terminal.at/
; Some common macros for hermes nasm code
;
; SDL_FUNC <symbol>
;   Exports <symbol> from the object file being assembled.
;   When HIDDEN_VISIBILITY is defined at assembly time the symbol is declared
;   with the ":function hidden" qualifiers on the GLOBAL directive; otherwise
;   a plain GLOBAL directive is emitted.
%macro SDL_FUNC 1
%ifdef HIDDEN_VISIBILITY
GLOBAL %1:function hidden
%else
GLOBAL %1
%endif
%endmacro
;
; mmx format converter main loops for HERMES
; Some routines Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
; This source code is licensed under the GNU LGPL
;
; Please refer to the file COPYING.LIB contained in the distribution for
; licensing conditions
;
BITS 32
%include "common.inc"
SDL_FUNC _ConvertMMX
SECTION .text
;; _ConvertMMX:
;; [ESP+8] ConverterInfo*
;; --------------------------------------------------------------------------
;; ConverterInfo (ebp+..)
;; 0: void *s_pixels
;; 4: int s_width
;; 8: int s_height
;; 12: int s_add
;; 16: void *d_pixels
;; 20: int d_width
;; 24: int d_height
;; 28: int d_add
;; 32: void (*converter_function)()
;; 36: int32 *lookup
;; Row driver for the MMX converters.
;; Loads esi/edi with the source/destination pointers from the ConverterInfo
;; structure, then for every row sets ecx to s_width and calls the scanline
;; converter stored at offset 32.  After each row the s_add / d_add values
;; are added to esi / edi.  Note that the row counter is the s_height field
;; itself: it is decremented in place, inside the caller's structure.
_ConvertMMX:
        push ebp
        mov ebp,esp

; Save the registers used by the blitters, necessary for optimized code
        pusha

        mov eax,[ebp+8]                 ; eax = ConverterInfo*

        cmp dword [eax+4],BYTE 0        ; s_width == 0: nothing to convert
        je endconvert

        mov ebp,eax                     ; ebp = ConverterInfo* for the loop

        mov esi,[ebp+0]                 ; s_pixels
        mov edi,[ebp+16]                ; d_pixels

y_loop:
        mov ecx,[ebp+4]                 ; s_width
        call [ebp+32]                   ; converter_function
        add esi,[ebp+12]                ; s_add
        add edi,[ebp+28]                ; d_add
        dec dword [ebp+8]               ; s_height, counted down in place
        jnz y_loop

; Both the normal path and the zero-width early exit leave through here.
; The early exit used to jump past popa / pop ebp, so ret was executed with
; the nine saved registers still on the stack.
endconvert:
; Restore the registers used by the blitters, necessary for optimized code
        popa
        pop ebp

        emms
        ret
%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
;
; pII-optimised MMX format converters for HERMES
; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
; and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
; This source code is licensed under the GNU LGPL
;
; Please refer to the file COPYING.LIB contained in the distribution for
; licensing conditions
;
; COPYRIGHT NOTICE
;
; This file partly contains code that is (c) Intel Corporation, specifically
; the mode detection routine, and the converter to 15 bit (8 pixel
; conversion routine from the mmx programming tutorial pages).
;
;
; These routines aren't exactly pII optimised - it's just that as they
; are, they're terrible on p5 MMXs, but less so on pIIs. Someone needs to
; optimise them for p5 MMXs..
BITS 32
%include "common.inc"
SDL_FUNC _ConvertMMXpII32_24RGB888
SDL_FUNC _ConvertMMXpII32_16RGB565
SDL_FUNC _ConvertMMXpII32_16BGR565
SDL_FUNC _ConvertMMXpII32_16RGB555
SDL_FUNC _ConvertMMXpII32_16BGR555
;; Macros for conversion routines
;; The 64-bit MMX constants are built on the stack: a 32-bit immediate is
;; pushed twice, so the quadword at [esp] holds that immediate in both of
;; its dwords.  Every load_immq / pand_immq therefore leaves 8 bytes on the
;; stack, which must be released later with CLEANUP_IMMQ_LOADS.

; _push_immq_mask imm32 -- push imm32 twice (8 bytes at [esp]).
%macro _push_immq_mask 1
push dword %1
push dword %1
%endmacro
; load_immq mmreg, imm32 -- mmreg = quadword made of two copies of imm32.
%macro load_immq 2
_push_immq_mask %2
movq %1, [esp]
%endmacro
; pand_immq mmreg, imm32 -- mmreg &= quadword made of two copies of imm32.
%macro pand_immq 2
_push_immq_mask %2
pand %1, [esp]
%endmacro
; CLEANUP_IMMQ_LOADS(num) -- release the stack space of num earlier
; load_immq / pand_immq expansions (8 bytes each).  The operand is encoded
; with the "byte" size override, so 8 * num has to stay small.
%define CLEANUP_IMMQ_LOADS(num) \
add esp, byte 8 * num
%define mmx32_rgb888_mask 00ffffffh
%define mmx32_rgb565_b 000000f8h
%define mmx32_rgb565_g 0000fc00h
%define mmx32_rgb565_r 00f80000h
%define mmx32_rgb555_rb 00f800f8h
%define mmx32_rgb555_g 0000f800h
%define mmx32_rgb555_mul 20000008h
%define mmx32_bgr555_mul 00082000h
SECTION .text
;; _ConvertMMXpII32_24RGB888
;; Converts one scanline of 32-bit pixels into packed 24-bit pixels by
;; dropping the top byte of every source pixel.
;;   esi = source pixels, edi = destination pixels, ecx = pixel count
;; Four pixels (16 source bytes -> 12 destination bytes) are handled per MMX
;; iteration; the remaining 0-3 pixels are copied byte by byte.
;; On return esi / edi point just past the converted pixels.
;; Modifies al, bl, ecx, edx and mm0-mm4, mm6, mm7.
_ConvertMMXpII32_24RGB888:

        ; set up mm6 as the mask, mm7 as zero
        load_immq mm6, mmx32_rgb888_mask
        CLEANUP_IMMQ_LOADS(1)
        pxor mm7, mm7

        mov edx, ecx                    ; save ecx
        and ecx, 0fffffffch             ; clear lower two bits
        jnz .L1
        jmp .L2

.L1:
        movq mm0, [esi]                 ; A R G B a r g b
        pand mm0, mm6                   ; 0 R G B 0 r g b
        movq mm1, [esi+8]               ; A R G B a r g b
        pand mm1, mm6                   ; 0 R G B 0 r g b

        movq mm2, mm0                   ; 0 R G B 0 r g b
        punpckhdq mm2, mm7              ; 0 0 0 0 0 R G B
        punpckldq mm0, mm7              ; 0 0 0 0 0 r g b
        psllq mm2, 24                   ; 0 0 R G B 0 0 0
        por mm0, mm2                    ; 0 0 R G B r g b

        movq mm3, mm1                   ; 0 R G B 0 r g b
        psllq mm3, 48                   ; g b 0 0 0 0 0 0
        por mm0, mm3                    ; g b R G B r g b

        movq mm4, mm1                   ; 0 R G B 0 r g b
        punpckhdq mm4, mm7              ; 0 0 0 0 0 R G B
        punpckldq mm1, mm7              ; 0 0 0 0 0 r g b
        psrlq mm1, 16                   ; 0 0 0 0 0 0 0 r
        psllq mm4, 8                    ; 0 0 0 0 R G B 0
        por mm1, mm4                    ; 0 0 0 0 R G B r

        movq [edi], mm0                 ; first 8 output bytes
        add esi, BYTE 16
        movd [edi+8], mm1               ; last 4 output bytes
        add edi, BYTE 12
        sub ecx, BYTE 4
        jnz .L1

.L2:
        mov ecx, edx
        and ecx, BYTE 3                 ; leftover pixels
        jz .L4
.L3:
        mov al, [esi]
        mov bl, [esi+1]
        mov dl, [esi+2]
        mov [edi], al
        mov [edi+1], bl
        mov [edi+2], dl
        add esi, BYTE 4
        add edi, BYTE 3
        dec ecx
        jnz .L3
.L4:
        ; "return" is not an instruction: the assembler treats it as a
        ; label, and execution would run on into the next converter.
        retn
;; _ConvertMMXpII32_16RGB565
;; Converts one scanline of 32-bit pixels to 16-bit 5-6-5 pixels with the
;; source byte 2 (red) in the top five bits and byte 0 (blue) in the low five.
;;   esi = source pixels, edi = destination pixels, ecx = pixel count
;; The MMX loop (.L1) converts four pixels per iteration (16 bytes in,
;; 8 bytes out); the scalar loop (.L3) handles the remaining 0-3 pixels.
;; Modifies eax, ebx, ecx, edx and mm0-mm7.
_ConvertMMXpII32_16RGB565:
; set up masks
load_immq mm5, mmx32_rgb565_b
load_immq mm6, mmx32_rgb565_g
load_immq mm7, mmx32_rgb565_r
CLEANUP_IMMQ_LOADS(3)
mov edx, ecx
shr ecx, 2
jnz .L1
jmp .L2 ; not necessary at the moment, but doesn't hurt (much)
.L1:
movq mm0, [esi] ; argb
movq mm1, mm0 ; argb
pand mm0, mm6 ; 00g0
movq mm3, mm1 ; argb
pand mm1, mm5 ; 000b
pand mm3, mm7 ; 0r00
pslld mm1, 2 ; 0 0 000000bb bbb00000
por mm0, mm1 ; 0 0 ggggggbb bbb00000
psrld mm0, 5 ; 0 0 00000ggg gggbbbbb
movq mm4, [esi+8] ; argb
movq mm2, mm4 ; argb
pand mm4, mm6 ; 00g0
movq mm1, mm2 ; argb
pand mm2, mm5 ; 000b
pand mm1, mm7 ; 0r00
pslld mm2, 2 ; 0 0 000000bb bbb00000
por mm4, mm2 ; 0 0 ggggggbb bbb00000
psrld mm4, 5 ; 0 0 00000ggg gggbbbbb
packuswb mm3, mm1 ; R 0 r 0
packssdw mm0, mm4 ; as above.. ish
por mm0, mm3 ; done.
movq [edi], mm0
add esi, 16
add edi, 8
dec ecx
jnz .L1
.L2:
; leftover pixels: count = original ecx & 3
mov ecx, edx
and ecx, BYTE 3
jz .L4
.L3:
; al = byte 0, bh = byte 1, ah = byte 2 of the source pixel; bl is not
; loaded, its bits are removed by the 07E0h mask below
mov al, [esi]
mov bh, [esi+1]
mov ah, [esi+2]
shr al, 3
and eax, 0F81Fh ; BYTE?
shr ebx, 5
and ebx, 07E0h ; BYTE?
add eax, ebx
mov [edi], al
mov [edi+1], ah
add esi, BYTE 4
add edi, BYTE 2
dec ecx
jnz .L3
.L4:
retn
;; _ConvertMMXpII32_16BGR565
;; Converts one scanline of 32-bit pixels to 16-bit 5-6-5 pixels with the
;; source byte 0 (blue) in the top five bits and byte 2 (red) in the low five.
;;   esi = source pixels, edi = destination pixels, ecx = pixel count
;; The MMX loop (.L1) converts four pixels per iteration (16 bytes in,
;; 8 bytes out); the scalar loop (.L3) handles the remaining 0-3 pixels and
;; counts down in edx.
;; Modifies eax, ebx, ecx, edx and mm0-mm7.
_ConvertMMXpII32_16BGR565:
; set up masks: mm5 = red, mm6 = green, mm7 = blue
load_immq mm5, mmx32_rgb565_r
load_immq mm6, mmx32_rgb565_g
load_immq mm7, mmx32_rgb565_b
CLEANUP_IMMQ_LOADS(3)
mov edx, ecx
shr ecx, 2
jnz .L1
jmp .L2
.L1:
movq mm0, [esi] ; a r g b
movq mm1, mm0 ; a r g b
pand mm0, mm6 ; 0 0 g 0
movq mm3, mm1 ; a r g b
pand mm1, mm5 ; 0 r 0 0
pand mm3, mm7 ; 0 0 0 b
psllq mm3, 16 ; 0 b 0 0
psrld mm1, 14 ; 0 0 000000rr rrr00000
por mm0, mm1 ; 0 0 ggggggrr rrr00000
psrld mm0, 5 ; 0 0 00000ggg gggrrrrr
movq mm4, [esi+8] ; a r g b
movq mm2, mm4 ; a r g b
pand mm4, mm6 ; 0 0 g 0
movq mm1, mm2 ; a r g b
pand mm2, mm5 ; 0 r 0 0
pand mm1, mm7 ; 0 0 0 b
psllq mm1, 16 ; 0 b 0 0
psrld mm2, 14 ; 0 0 000000rr rrr00000
por mm4, mm2 ; 0 0 ggggggrr rrr00000
psrld mm4, 5 ; 0 0 00000ggg gggrrrrr
packuswb mm3, mm1 ; BBBBB000 00000000 bbbbb000 00000000
packssdw mm0, mm4 ; 00000GGG GGGRRRRR 00000GGG GGGRRRRR
por mm0, mm3 ; BBBBBGGG GGGRRRRR bbbbbggg gggrrrrr
movq [edi], mm0
add esi, BYTE 16
add edi, BYTE 8
dec ecx
jnz .L1
.L2:
; leftover pixels: count = original ecx & 3, kept in edx
and edx, BYTE 3
jz .L4
.L3:
; al = byte 2, bh = byte 1, ah = byte 0 of the source pixel; bl is not
; loaded, its bits are removed by the 07E0h mask below
mov al, [esi+2]
mov bh, [esi+1]
mov ah, [esi]
shr al, 3
and eax, 0F81Fh ; BYTE ?
shr ebx, 5
and ebx, 07E0h ; BYTE ?
add eax, ebx
mov [edi], al
mov [edi+1], ah
add esi, BYTE 4
add edi, BYTE 2
dec edx
jnz .L3
.L4:
retn
;; _ConvertMMXpII32_16BGR555 / _ConvertMMXpII32_16RGB555
;; Convert one scanline of 32-bit pixels to 15-bit 5-5-5 pixels.
;;   esi = source pixels, edi = destination pixels, ecx = pixel count
;; The two entry points differ only in the pmaddwd multiplier loaded into
;; mm7, which decides whether source byte 2 (red) or source byte 0 (blue)
;; ends up in bits 10-14 of the result.
;; The main loop (.L1) converts eight pixels per iteration (32 bytes in,
;; 16 bytes out); the tail loop (.L3) converts the remaining 0-7 pixels one
;; at a time with the same mm6 / mm7 constants, so both entry points keep
;; their channel order for every pixel of the row.
;; Modifies eax, ecx, edx and mm0-mm7.
_ConvertMMXpII32_16BGR555:

        ; the 16BGR555 converter is identical to the RGB555 one,
        ; except it uses a different multiplier for the pmaddwd
        ; instruction.  cool huh.

        load_immq mm7, mmx32_bgr555_mul
        jmp _convert_bgr555_cheat

; This is the same as the Intel version.. they obviously went to
; much more trouble to expand/coil the loop than I did, so theirs
; would almost certainly be faster, even if only a little.
; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is
; (I think) a more accurate name..
_ConvertMMXpII32_16RGB555:

        load_immq mm7, mmx32_rgb555_mul
_convert_bgr555_cheat:
        load_immq mm6, mmx32_rgb555_g
        CLEANUP_IMMQ_LOADS(2)           ; mm7 and mm6 loads

        mov edx,ecx                     ; Save ecx

        and ecx,DWORD 0fffffff8h        ; clear lower three bits
        jnz .L_OK
        jmp near .L2

.L_OK:
        ; prologue of the software-pipelined loop: start on the first
        ; four pixels before entering .L1
        movq mm2,[esi+8]

        movq mm0,[esi]
        movq mm3,mm2

        pand_immq mm3, mmx32_rgb555_rb
        movq mm1,mm0

        pand_immq mm1, mmx32_rgb555_rb
        pmaddwd mm3,mm7

        CLEANUP_IMMQ_LOADS(2)

        pmaddwd mm1,mm7
        pand mm2,mm6

.L1:
        movq mm4,[esi+24]
        pand mm0,mm6

        movq mm5,[esi+16]
        por mm3,mm2

        psrld mm3,6
        por mm1,mm0

        movq mm0,mm4
        psrld mm1,6

        pand_immq mm0, mmx32_rgb555_rb
        packssdw mm1,mm3

        movq mm3,mm5
        pmaddwd mm0,mm7

        pand_immq mm3, mmx32_rgb555_rb
        pand mm4,mm6

        movq [edi],mm1
        pmaddwd mm3,mm7

        add esi,BYTE 32
        por mm4,mm0

        pand mm5,mm6
        psrld mm4,6

        ; NOTE(review): these two loads fetch the start of the NEXT group
        ; of eight pixels before the loop counter is tested, so on the last
        ; iteration they read the 16 bytes that follow the converted data.
        movq mm2,[esi+8]
        por mm5,mm3

        movq mm0,[esi]
        psrld mm5,6

        movq mm3,mm2
        movq mm1,mm0

        pand_immq mm3, mmx32_rgb555_rb
        packssdw mm5,mm4

        pand_immq mm1, mmx32_rgb555_rb
        pand mm2,mm6

        CLEANUP_IMMQ_LOADS(4)           ; the four pand_immq of this pass

        movq [edi+8],mm5
        pmaddwd mm3,mm7

        pmaddwd mm1,mm7
        add edi,BYTE 16

        sub ecx,BYTE 8
        jz .L2
        jmp .L1

.L2:
        mov ecx,edx

        and ecx,BYTE 7                  ; leftover pixels
        jz .L4

        ; The tail previously used fixed integer shifts that always placed
        ; source byte 2 in bits 10-14, which is only right for the RGB555
        ; entry point.  Reusing the multiplier in mm7 makes the tail follow
        ; whichever entry point was used, exactly like the main loop.
        load_immq mm5, mmx32_rgb555_rb
        CLEANUP_IMMQ_LOADS(1)

.L3:
        movd mm0,[esi]                  ; one source pixel
        add esi,BYTE 4

        movq mm1,mm0
        pand mm1,mm5                    ; keep top 5 bits of bytes 0 and 2

        pand mm0,mm6                    ; keep top 5 bits of byte 1
        pmaddwd mm1,mm7                 ; move bytes 0 and 2 into position

        por mm1,mm0
        psrld mm1,6                     ; 15-bit pixel in the low word

        movd eax,mm1
        mov [edi],ax

        add edi,BYTE 2
        dec ecx
        jnz .L3

.L4:
        retn
%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
;
; x86 format converters for HERMES
; Some routines Copyright (c) 1998 Christian Nentwich (brn@eleet.mcb.at)
; This source code is licensed under the GNU LGPL
;
; Please refer to the file COPYING.LIB contained in the distribution for
; licensing conditions
;
; Most routines are (c) Glenn Fiedler (ptc@gaffer.org), used with permission
;
BITS 32
%include "common.inc"
SDL_FUNC _ConvertX86
SECTION .text
;; _ConvertX86:
;; [ESP+8] ConverterInfo*
;; --------------------------------------------------------------------------
;; ConverterInfo (ebp+..)
;; 0: void *s_pixels
;; 4: int s_width
;; 8: int s_height
;; 12: int s_add
;; 16: void *d_pixels
;; 20: int d_width
;; 24: int d_height
;; 28: int d_add
;; 32: void (*converter_function)()
;; 36: int32 *lookup
;; Row driver for the plain x86 converters.
;; Loads esi/edi with the source/destination pointers from the ConverterInfo
;; structure, then for every row sets ecx to s_width and calls the scanline
;; converter stored at offset 32.  After each row the s_add / d_add values
;; are added to esi / edi.  Note that the row counter is the s_height field
;; itself: it is decremented in place, inside the caller's structure.
_ConvertX86:
        push ebp
        mov ebp,esp

; Save the registers used by the blitters, necessary for optimized code
        pusha

        mov eax,[ebp+8]                 ; eax = ConverterInfo*

        cmp dword [eax+4],BYTE 0        ; s_width == 0: nothing to convert
        je endconvert

        mov ebp,eax                     ; ebp = ConverterInfo* for the loop

        mov esi,[ebp+0]                 ; s_pixels
        mov edi,[ebp+16]                ; d_pixels

y_loop:
        mov ecx,[ebp+4]                 ; s_width
        call [ebp+32]                   ; converter_function
        add esi,[ebp+12]                ; s_add
        add edi,[ebp+28]                ; d_add
        dec dword [ebp+8]               ; s_height, counted down in place
        jnz y_loop

; Both the normal path and the zero-width early exit leave through here.
; The early exit used to jump past popa / pop ebp, so ret was executed with
; the nine saved registers still on the stack.
endconvert:
; Restore the registers used by the blitters, necessary for optimized code
        popa
        pop ebp

        ret
%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
;
; x86 format converters for HERMES
; Copyright (c) 1998 Glenn Fiedler (gaffer@gaffer.org)
; This source code is licensed under the GNU LGPL
;
; Please refer to the file COPYING.LIB contained in the distribution for
; licensing conditions
;
; Routines adjusted for Hermes by Christian Nentwich (brn@eleet.mcb.at)
; Used with permission.
;
BITS 32
%include "common.inc"
SDL_FUNC _ConvertX86p16_16BGR565
SDL_FUNC _ConvertX86p16_16RGB555
SDL_FUNC _ConvertX86p16_16BGR555
SDL_FUNC _ConvertX86p16_8RGB332
EXTERN _ConvertX86
SECTION .text
_ConvertX86p16_16BGR565:
; check short
cmp ecx,BYTE 16
ja .L3
.L1 ; short loop
mov al,[esi]
mov ah,[esi+1]
mov ebx,eax
mov edx,eax
shr eax,11
and eax,BYTE 11111b
and ebx,11111100000b
shl edx,11
add eax,ebx
add eax,edx
mov [edi],al
mov [edi+1],ah
add esi,BYTE 2
add edi,BYTE 2
dec ecx
jnz .L1
.L2
retn
.L3 ; head
mov eax,edi
and eax,BYTE 11b
jz .L4
mov al,[esi]
mov ah,[esi+1]
mov ebx,eax
mov edx,eax
shr eax,11
and eax,BYTE 11111b
and ebx,11111100000b
shl edx,11
add eax,ebx
add eax,edx
mov [edi],al
mov [edi+1],ah
add esi,BYTE 2
add edi,BYTE 2
dec ecx
.L4 ; save count
push ecx
; unroll twice
shr ecx,1
; point arrays to end
lea esi,[esi+ecx*4]
lea edi,[edi+ecx*4]
; negative counter
neg ecx
jmp SHORT .L6
.L5 mov [edi+ecx*4-4],eax
.L6 mov eax,[esi+ecx*4]
mov ebx,[esi+ecx*4]
and eax,07E007E0h
mov edx,[esi+ecx*4]
and ebx,0F800F800h
shr ebx,11
and edx,001F001Fh
shl edx,11
add eax,ebx
add eax,edx
inc ecx
jnz .L5
mov [edi+ecx*4-4],eax
; tail
pop ecx
and ecx,BYTE 1
jz .L7
mov al,[esi]
mov ah,[esi+1]
mov ebx,eax
mov edx,eax
shr eax,11
and eax,BYTE 11111b
and ebx,11111100000b
shl edx,11
add eax,ebx
add eax,edx
mov [edi],al
mov [edi+1],ah
add esi,BYTE 2
add edi,BYTE 2
.L7
retn
_ConvertX86p16_16RGB555:
; check short
cmp ecx,BYTE 32
ja .L3
.L1 ; short loop
mov al,[esi]
mov ah,[esi+1]
mov ebx,eax
shr ebx,1
and ebx, 0111111111100000b
and eax,BYTE 0000000000011111b
add eax,ebx
mov [edi],al
mov [edi+1],ah
add esi,BYTE 2
add edi,BYTE 2
dec ecx
jnz .L1
.L2
retn
.L3 ; head
mov eax,edi
and eax,BYTE 11b
jz .L4
mov al,[esi]
mov ah,[esi+1]
mov ebx,eax
shr ebx,1
and ebx, 0111111111100000b
and eax,BYTE 0000000000011111b
add eax,ebx
mov [edi],al
mov [edi+1],ah
add esi,BYTE 2
add edi,BYTE 2
dec ecx
.L4 ; save ebp
push ebp
; save count
push ecx
; unroll four times
shr ecx,2
; point arrays to end
lea esi,[esi+ecx*8]
lea edi,[edi+ecx*8]
; negative counter
xor ebp,ebp
sub ebp,ecx
.L5 mov eax,[esi+ebp*8] ; agi?
mov ecx,[esi+ebp*8+4]
mov ebx,eax
mov edx,ecx
and eax,0FFC0FFC0h
and ecx,0FFC0FFC0h
shr eax,1
and ebx,001F001Fh
shr ecx,1
and edx,001F001Fh
add eax,ebx
add ecx,edx
mov [edi+ebp*8],eax
mov [edi+ebp*8+4],ecx
inc ebp
jnz .L5
; tail
pop ecx
.L6 and ecx,BYTE 11b
jz .L7
mov al,[esi]
mov ah,[esi+1]
mov ebx,eax
shr ebx,1
and ebx, 0111111111100000b
and eax,BYTE 0000000000011111b
add eax,ebx
mov [edi],al
mov [edi+1],ah
add esi,BYTE 2
add edi,BYTE 2
dec ecx
jmp SHORT .L6
.L7 pop ebp
retn
_ConvertX86p16_16BGR555:
; check short
cmp ecx,BYTE 16
ja .L3
.L1 ; short loop
mov al,[esi]
mov ah,[esi+1]
mov ebx,eax
mov edx,eax
shr eax,11
and eax,BYTE 11111b
shr ebx,1
and ebx,1111100000b
shl edx,10
and edx,0111110000000000b
add eax,ebx
add eax,edx
mov [edi],al
mov [edi+1],ah
add esi,BYTE 2
add edi,BYTE 2
dec ecx
jnz .L1
.L2
retn
.L3 ; head
mov eax,edi
and eax,BYTE 11b
jz .L4
mov al,[esi]
mov ah,[esi+1]
mov ebx,eax
mov edx,eax
shr eax,11
and eax,BYTE 11111b
shr ebx,1
and ebx,1111100000b
shl edx,10
and edx,0111110000000000b
add eax,ebx
add eax,edx
mov [edi],al
mov [edi+1],ah
add esi,BYTE 2
add edi,BYTE 2
dec ecx
.L4 ; save count
push ecx
; unroll twice
shr ecx,1
; point arrays to end
lea esi,[esi+ecx*4]
lea edi,[edi+ecx*4]
; negative counter
neg ecx
jmp SHORT .L6
.L5 mov [edi+ecx*4-4],eax
.L6 mov eax,[esi+ecx*4]
shr eax,1
mov ebx,[esi+ecx*4]
and eax,03E003E0h
mov edx,[esi+ecx*4]
and ebx,0F800F800h
shr ebx,11
and edx,001F001Fh
shl edx,10
add eax,ebx
add eax,edx
inc ecx
jnz .L5
mov [edi+ecx*4-4],eax
; tail
pop ecx
and ecx,BYTE 1
jz .L7
mov al,[esi]
mov ah,[esi+1]
mov ebx,eax
mov edx,eax
shr eax,11
and eax,BYTE 11111b
shr ebx,1
and ebx,1111100000b
shl edx,10
and edx,0111110000000000b
add eax,ebx
add eax,edx
mov [edi],al
mov [edi+1],ah
add esi,BYTE 2
add edi,BYTE 2
.L7
retn
_ConvertX86p16_8RGB332:
; check short
cmp ecx,BYTE 16
ja .L3
.L1 ; short loop
mov al,[esi+0]
mov ah,[esi+1]
mov ebx,eax
mov edx,eax
and eax,BYTE 11000b ; blue
shr eax,3
and ebx,11100000000b ; green
shr ebx,6
and edx,1110000000000000b ; red
shr edx,8
add eax,ebx
add eax,edx
mov [edi],al
add esi,BYTE 2
inc edi
dec ecx
jnz .L1
.L2
retn
.L3 mov eax,edi
and eax,BYTE 11b
jz .L4
mov al,[esi+0]
mov ah,[esi+1]
mov ebx,eax
mov edx,eax
and eax,BYTE 11000b ; blue
shr eax,3
and ebx,11100000000b ; green
shr ebx,6
and edx,1110000000000000b ; red
shr edx,8
add eax,ebx
add eax,edx
mov [edi],al
add esi,BYTE 2
inc edi
dec ecx
jmp SHORT .L3
.L4 ; save ebp
push ebp
; save count
push ecx
; unroll 4 times
shr ecx,2
; prestep
mov dl,[esi+0]
mov bl,[esi+1]
mov dh,[esi+2]
.L5 shl edx,16
mov bh,[esi+3]
shl ebx,16
mov dl,[esi+4]
mov dh,[esi+6]
mov bl,[esi+5]
and edx,00011000000110000001100000011000b
mov bh,[esi+7]
ror edx,16+3
mov eax,ebx ; setup eax for reds
and ebx,00000111000001110000011100000111b
and eax,11100000111000001110000011100000b ; reds
ror ebx,16-2
add esi,BYTE 8
ror eax,16
add edi,BYTE 4
add eax,ebx
mov bl,[esi+1] ; greens
add eax,edx
mov dl,[esi+0] ; blues
mov [edi-4],eax
mov dh,[esi+2]
dec ecx
jnz .L5
; check tail
pop ecx
and ecx,BYTE 11b
jz .L7
.L6 ; tail
mov al,[esi+0]
mov ah,[esi+1]
mov ebx,eax
mov edx,eax
and eax,BYTE 11000b ; blue
shr eax,3
and ebx,11100000000b ; green
shr ebx,6
and edx,1110000000000000b ; red
shr edx,8
add eax,ebx
add eax,edx
mov [edi],al
add esi,BYTE 2
inc edi
dec ecx
jnz .L6
.L7 pop ebp
retn
%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
This diff is collapsed.
......@@ -24,6 +24,7 @@
#include "SDL_video.h"
#include "SDL_sysvideo.h"
#include "SDL_blit.h"
#include "SDL_blit_copy.h"
#include "SDL_RLEaccel_c.h"
#include "SDL_pixels_c.h"
......@@ -106,111 +107,64 @@ SDL_SoftBlit(SDL_Surface * src, SDL_Rect * srcrect,
return (okay ? 0 : -1);
}
#ifdef MMX_ASMBLIT
static __inline__ void
SDL_memcpyMMX(Uint8 * to, const Uint8 * from, int len)
{
int i;
#ifdef __MACOSX__
#include <sys/sysctl.h>
for (i = 0; i < len / 8; i++) {
__asm__ __volatile__(" movq (%0), %%mm0\n"
" movq %%mm0, (%1)\n"::"r"(from),
"r"(to):"memory");
from += 8;
to += 8;
}
if (len & 7)
SDL_memcpy(to, from, len & 7);
}
static __inline__ void
SDL_memcpySSE(Uint8 * to, const Uint8 * from, int len)
static SDL_bool SDL_UseAltivecPrefetch()
{
int i;
__asm__ __volatile__(" prefetchnta (%0)\n"
" prefetchnta 64(%0)\n"
" prefetchnta 128(%0)\n"
" prefetchnta 192(%0)\n"::"r"(from));
const char key[] = "hw.l3cachesize";
u_int64_t result = 0;
size_t typeSize = sizeof(result);
for (i = 0; i < len / 8; i++) {
__asm__ __volatile__(" prefetchnta 256(%0)\n"
" movq (%0), %%mm0\n"
" movntq %%mm0, (%1)\n"::"r"(from),
"r"(to):"memory");
from += 8;
to += 8;
if (sysctlbyname(key, &result, &typeSize, NULL, 0) == 0 && result > 0) {
return SDL_TRUE;
} else {
return SDL_FALSE;
}
if (len & 7)
SDL_memcpy(to, from, len & 7);
}
#endif
static void
SDL_BlitCopy(SDL_BlitInfo * info)
#else
static SDL_bool SDL_UseAltivecPrefetch()
{
Uint8 *src, *dst;
int w, h;
int srcskip, dstskip;
w = info->d_width * info->dst->BytesPerPixel;
h = info->d_height;
src = info->s_pixels;
dst = info->d_pixels;
srcskip = w + info->s_skip;
dstskip = w + info->d_skip;
#ifdef MMX_ASMBLIT
if (SDL_HasSSE()) {
while (h--) {
SDL_memcpySSE(dst, src, w);
src += srcskip;
dst += dstskip;
}
__asm__ __volatile__(" emms\n"::);
} else if (SDL_HasMMX()) {
while (h--) {
SDL_memcpyMMX(dst, src, w);
src += srcskip;
dst += dstskip;
}
__asm__ __volatile__(" emms\n"::);
} else
#endif
while (h--) {
SDL_memcpy(dst, src, w);
src += srcskip;
dst += dstskip;
}
/* Just guess G4 */
return SDL_TRUE;
}
#endif /* __MACOSX__ */
static void
SDL_BlitCopyOverlap(SDL_BlitInfo * info)
static SDL_loblit SDL_ChooseBlitFunc(SDL_BlitEntry *entries, int count)
{
Uint8 *src, *dst;
int w, h;
int srcskip, dstskip;
int i;
static Uint32 features = 0xffffffff;
if (features == 0xffffffff) {
features = SDL_BLIT_ANY;
w = info->d_width * info->dst->BytesPerPixel;
h = info->d_height;
src = info->s_pixels;
dst = info->d_pixels;
srcskip = w + info->s_skip;
dstskip = w + info->d_skip;
if (dst < src) {
while (h--) {
SDL_memcpy(dst, src, w);
src += srcskip;
dst += dstskip;
/* Provide an override for testing .. */
const char *override = SDL_getenv("SDL_BLIT_FEATURES");
if (override) {
SDL_sscanf(override, "%u", &features);
} else {
if (SDL_HasMMX()) {
features |= SDL_BLIT_MMX;
}
if (SDL_HasSSE()) {
features |= SDL_BLIT_SSE;
}
if (SDL_HasAltivec()) {
if (SDL_UseAltivecPrefetch()) {
features |= SDL_BLIT_ALTIVEC_PREFETCH;
} else {
features |= SDL_BLIT_ALTIVEC_NOPREFETCH;
}
}
}
} else {
src += ((h - 1) * srcskip);
dst += ((h - 1) * dstskip);
while (h--) {
SDL_revcpy(dst, src, w);
src -= srcskip;
dst -= dstskip;
}
/*
 * Scan the table from its last entry down to entries[1] and return the
 * first blitter whose feature mask overlaps the detected CPU features.
 * Assuming count is the number of entries (TODO confirm against the
 * callers), the valid indices are 0 .. count - 1, so the scan has to start
 * at count - 1; starting at count reads one element past the table.
 */
for (i = count - 1; i > 0; --i) {
    if (features & entries[i].features) {
        return entries[i].blit;
    }
}
/* entries[0] is the fallback when no other entry matched. */
return entries[0].blit;
}
/* Figure out which of many blit routines to set up on a surface */
......@@ -237,11 +191,11 @@ SDL_CalculateBlit(SDL_Surface * surface)
/* Check for special "identity" case -- copy blit */
if (surface->map->identity && blit_index == 0) {
surface->map->sw_data->blit = SDL_BlitCopy;
/* Handle overlapping blits on the same surface */
if (surface == surface->map->dst) {
surface->map->sw_data->blit = SDL_BlitCopyOverlap;
} else {
surface->map->sw_data->blit = SDL_BlitCopy;
}
} else {
if (surface->format->BitsPerPixel < 8) {
......
......@@ -67,6 +67,17 @@ typedef struct SDL_BlitMap
unsigned int format_version;
} SDL_BlitMap;
/* CPU feature bits used to tag blitter implementations.  SDL_ChooseBlitFunc
   builds a mask of these from the SDL_Has*() CPU checks (or from the
   SDL_BLIT_FEATURES environment variable) and ANDs it with each entry. */
#define SDL_BLIT_ANY 0x00000000 /* no feature bit set; initial mask value */
#define SDL_BLIT_MMX 0x00000001 /* set when SDL_HasMMX() is true */
#define SDL_BLIT_SSE 0x00000002 /* set when SDL_HasSSE() is true */
#define SDL_BLIT_ALTIVEC_PREFETCH 0x00000004 /* Altivec, prefetch usable */
#define SDL_BLIT_ALTIVEC_NOPREFETCH 0x00000008 /* Altivec, no prefetch */
/* One row of a blitter table: a blit routine and the feature bits that
   select it. */
typedef struct SDL_BlitEntry
{
Uint32 features; /* SDL_BLIT_* bits tested against the detected features */
SDL_loblit blit; /* routine returned when the bits match */
} SDL_BlitEntry;
/* Functions found in SDL_blit.c */
extern int SDL_CalculateBlit(SDL_Surface * surface);
......
......@@ -879,19 +879,6 @@ GetBlitFeatures(void)
#define LO 1
#endif
#if SDL_HERMES_BLITTERS
/* Heheheh, we coerce Hermes into using SDL blit information */
#define X86_ASSEMBLER
#define HermesConverterInterface SDL_BlitInfo
#define HermesClearInterface void
#define STACKCALL
#include "../hermes/HeadMMX.h"
#include "../hermes/HeadX86.h"
#else
/* Special optimized blit for RGB 8-8-8 --> RGB 3-3-2 */
#define RGB888_RGB332(dst, src) { \
dst = (Uint8)((((src)&0x00E00000)>>16)| \
......@@ -1250,8 +1237,6 @@ Blit_RGB888_RGB565(SDL_BlitInfo * info)
#endif /* USE_DUFFS_LOOP */
}
#endif /* SDL_HERMES_BLITTERS */
/* Special optimized blit for RGB 5-6-5 --> 32-bit RGB surfaces */
#define RGB565_32(dst, src, map) (map[src[LO]*2] + map[src[HI]*2+1])
......@@ -2357,17 +2342,7 @@ static const struct blit_table normal_blit_1[] = {
{0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL},
};
static const struct blit_table normal_blit_2[] = {
#if SDL_HERMES_BLITTERS
{0x0000F800, 0x000007E0, 0x0000001F, 2, 0x0000001F, 0x000007E0,
0x0000F800,
0, ConvertX86p16_16BGR565, ConvertX86, NO_ALPHA},
{0x0000F800, 0x000007E0, 0x0000001F, 2, 0x00007C00, 0x000003E0,
0x0000001F,
0, ConvertX86p16_16RGB555, ConvertX86, NO_ALPHA},
{0x0000F800, 0x000007E0, 0x0000001F, 2, 0x0000001F, 0x000003E0,
0x00007C00,
0, ConvertX86p16_16BGR555, ConvertX86, NO_ALPHA},
#elif SDL_ALTIVEC_BLITTERS
#if SDL_ALTIVEC_BLITTERS
/* has-altivec */
{0x0000F800, 0x000007E0, 0x0000001F, 4, 0x00000000, 0x00000000,
0x00000000,
......@@ -2397,47 +2372,6 @@ static const struct blit_table normal_blit_3[] = {
{0, 0, 0, 0, 0, 0, 0, 0, NULL, BlitNtoN, 0}
};
static const struct blit_table normal_blit_4[] = {
#if SDL_HERMES_BLITTERS
{0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x0000F800, 0x000007E0,
0x0000001F,
1, ConvertMMXpII32_16RGB565, ConvertMMX, NO_ALPHA},
{0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x0000F800, 0x000007E0,
0x0000001F,
0, ConvertX86p32_16RGB565, ConvertX86, NO_ALPHA},
{0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x0000001F, 0x000007E0,
0x0000F800,
1, ConvertMMXpII32_16BGR565, ConvertMMX, NO_ALPHA},
{0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x0000001F, 0x000007E0,
0x0000F800,
0, ConvertX86p32_16BGR565, ConvertX86, NO_ALPHA},
{0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x00007C00, 0x000003E0,
0x0000001F,
1, ConvertMMXpII32_16RGB555, ConvertMMX, NO_ALPHA},
{0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x00007C00, 0x000003E0,
0x0000001F,
0, ConvertX86p32_16RGB555, ConvertX86, NO_ALPHA},
{0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x0000001F, 0x000003E0,
0x00007C00,
1, ConvertMMXpII32_16BGR555, ConvertMMX, NO_ALPHA},
{0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x0000001F, 0x000003E0,
0x00007C00,
0, ConvertX86p32_16BGR555, ConvertX86, NO_ALPHA},
{0x00FF0000, 0x0000FF00, 0x000000FF, 3, 0x00FF0000, 0x0000FF00,
0x000000FF,
0, ConvertX86p32_24RGB888, ConvertX86, NO_ALPHA},
{0x00FF0000, 0x0000FF00, 0x000000FF, 3, 0x000000FF, 0x0000FF00,
0x00FF0000,
0, ConvertX86p32_24BGR888, ConvertX86, NO_ALPHA},
{0x00FF0000, 0x0000FF00, 0x000000FF, 4, 0x000000FF, 0x0000FF00,
0x00FF0000,
0, ConvertX86p32_32BGR888, ConvertX86, NO_ALPHA},
{0x00FF0000, 0x0000FF00, 0x000000FF, 4, 0xFF000000, 0x00FF0000,
0x0000FF00,
0, ConvertX86p32_32RGBA888, ConvertX86, NO_ALPHA},
{0x00FF0000, 0x0000FF00, 0x000000FF, 4, 0x0000FF00, 0x00FF0000,
0xFF000000,
0, ConvertX86p32_32BGRA888, ConvertX86, NO_ALPHA},
#else
#if SDL_ALTIVEC_BLITTERS
/* has-altivec | dont-use-prefetch */
{0x00000000, 0x00000000, 0x00000000, 4, 0x00000000, 0x00000000,
......@@ -2460,7 +2394,6 @@ static const struct blit_table normal_blit_4[] = {
{0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x00007C00, 0x000003E0,
0x0000001F,
0, NULL, Blit_RGB888_RGB555, NO_ALPHA},
#endif
/* Default for 32-bit RGB source, used if no other blitter matches */
{0, 0, 0, 0, 0, 0, 0, 0, NULL, BlitNtoN, 0}
};
......@@ -2529,12 +2462,7 @@ SDL_CalculateBlitN(SDL_Surface * surface, int blit_index)
if (surface->map->table) {
blitfun = Blit_RGB888_index8_map;
} else {
#if SDL_HERMES_BLITTERS
sdata->aux_data = ConvertX86p32_8RGB332;
blitfun = ConvertX86;
#else
blitfun = Blit_RGB888_index8;
#endif
}
} else {
blitfun = BlitNto1;
......@@ -2575,13 +2503,6 @@ SDL_CalculateBlitN(SDL_Surface * surface, int blit_index)
}
#ifdef DEBUG_ASM
#if SDL_HERMES_BLITTERS
if (blitfun == ConvertMMX)
fprintf(stderr, "Using mmx blit\n");
else if (blitfun == ConvertX86)
fprintf(stderr, "Using asm blit\n");
else
#endif
if ((blitfun == BlitNtoN) || (blitfun == BlitNto1))
fprintf(stderr, "Using C blit\n");
else
......
/*
SDL - Simple DirectMedia Layer
Copyright (C) 1997-2006 Sam Lantinga
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
Sam Lantinga
slouken@libsdl.org
*/
#include "SDL_config.h"
#include "SDL_video.h"
#include "SDL_blit.h"
/* The MMX/SSE intrinsics don't give access to specific registers for
the most memory parallelism, so we'll use GCC inline assembly here...
*/
#ifndef __GNUC__
#undef __MMX__
#undef __SSE__
#endif
#ifdef __MMX__
/*
 * Copy len bytes from src to dst through the eight MMX registers.
 *
 * The bulk of the data is moved 64 bytes per iteration (eight 8-byte loads
 * followed by eight 8-byte stores); the remaining (len & 63) bytes are
 * handed to SDL_memcpy().
 *
 * Only baseline MMX instructions (movq) are used.  prefetchnta and movntq
 * were introduced together with SSE, so executing them on a CPU that
 * reports MMX but not SSE raises an invalid-opcode fault, and the caller
 * selects this routine on the strength of SDL_HasMMX() alone.
 *
 * This function does not execute "emms"; the MMX state is left dirty and
 * the caller is responsible for clearing it after the last call.
 */
static __inline__ void
SDL_memcpyMMX(Uint8 *dst, const Uint8 *src, int len)
{
    int i;

    for (i = len / 64; i--;) {
        __asm__ __volatile__ (
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "movq 16(%0), %%mm2\n"
        "movq 24(%0), %%mm3\n"
        "movq 32(%0), %%mm4\n"
        "movq 40(%0), %%mm5\n"
        "movq 48(%0), %%mm6\n"
        "movq 56(%0), %%mm7\n"
        "movq %%mm0, (%1)\n"
        "movq %%mm1, 8(%1)\n"
        "movq %%mm2, 16(%1)\n"
        "movq %%mm3, 24(%1)\n"
        "movq %%mm4, 32(%1)\n"
        "movq %%mm5, 40(%1)\n"
        "movq %%mm6, 48(%1)\n"
        "movq %%mm7, 56(%1)\n"
        :: "r" (src), "r" (dst)
        /* every register the template overwrites must be declared */
        : "memory", "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");
        src += 64;
        dst += 64;
    }
    /* tail: fewer than 64 bytes left */
    if (len & 63)
        SDL_memcpy(dst, src, len & 63);
}
#endif /* __MMX__ */
#ifdef __SSE__
/*
 * Copy len bytes from src to dst through four SSE registers.
 *
 * The bulk of the data is moved 64 bytes per iteration: four 16-byte
 * aligned loads (movaps) followed by four 16-byte non-temporal stores
 * (movntps).  The remaining (len & 63) bytes are handed to SDL_memcpy().
 *
 * Both src and dst must be 16-byte aligned whenever len >= 64, because
 * movaps and movntps fault on unaligned memory operands.
 *
 * NOTE(review): the non-temporal stores are weakly ordered and no sfence
 * is issued here - confirm that no caller relies on store ordering.
 */
static __inline__ void
SDL_memcpySSE(Uint8 *dst, const Uint8 *src, int len)
{
    int i;

    for (i = len / 64; i--;) {
        __asm__ __volatile__ (
        "prefetchnta (%0)\n"
        "movaps (%0), %%xmm0\n"
        "movaps 16(%0), %%xmm1\n"
        "movaps 32(%0), %%xmm2\n"
        "movaps 48(%0), %%xmm3\n"
        "movntps %%xmm0, (%1)\n"
        "movntps %%xmm1, 16(%1)\n"
        "movntps %%xmm2, 32(%1)\n"
        "movntps %%xmm3, 48(%1)\n"
        :: "r" (src), "r" (dst)
        /* declare the scratch registers so the compiler never keeps a
           live value in them across this statement once inlined */
        : "memory", "xmm0", "xmm1", "xmm2", "xmm3");
        src += 64;
        dst += 64;
    }
    /* tail: fewer than 64 bytes left */
    if (len & 63)
        SDL_memcpy(dst, src, len & 63);
}
#endif /* __SSE__ */
/*
 * Straight row-by-row copy of a rectangle of pixels.
 *
 * Copies info->d_height rows of (info->d_width * BytesPerPixel) bytes from
 * info->s_pixels to info->d_pixels.  After each row the source advances by
 * the row width plus info->s_skip and the destination by the row width
 * plus info->d_skip.
 *
 * When compiled with SSE/MMX support and the CPU reports the feature at
 * run time, an accelerated row copy is used.  The accelerated paths are
 * only taken when the first row AND the row strides are suitably aligned:
 * checking the start addresses alone is not enough, because a stride that
 * is not a multiple of 16 misaligns every following row and movaps/movntps
 * fault on unaligned addresses.
 */
void
SDL_BlitCopy(SDL_BlitInfo * info)
{
    Uint8 *src, *dst;
    int w, h;
    int srcskip, dstskip;

    w = info->d_width * info->dst->BytesPerPixel;
    h = info->d_height;
    src = info->s_pixels;
    dst = info->d_pixels;
    /* full distance from the start of one row to the start of the next */
    srcskip = w + info->s_skip;
    dstskip = w + info->d_skip;

#ifdef __SSE__
    if (SDL_HasSSE() &&
        !((uintptr_t)src & 15) && !(srcskip & 15) &&
        !((uintptr_t)dst & 15) && !(dstskip & 15)) {
        while (h--) {
            SDL_memcpySSE(dst, src, w);
            src += srcskip;
            dst += dstskip;
        }
        return;
    }
#endif

#ifdef __MMX__
    if (SDL_HasMMX() &&
        !((uintptr_t)src & 7) && !(srcskip & 7) &&
        !((uintptr_t)dst & 7) && !(dstskip & 7)) {
        while (h--) {
            SDL_memcpyMMX(dst, src, w);
            src += srcskip;
            dst += dstskip;
        }
        /* leave MMX state so later x87 floating point code works */
        __asm__ __volatile__(" emms\n"::);
        return;
    }
#endif

    /* portable fallback */
    while (h--) {
        SDL_memcpy(dst, src, w);
        src += srcskip;
        dst += dstskip;
    }
}
/*
 * Row-by-row copy for source and destination buffers that may overlap.
 *
 * If the destination starts before the source, or at/after the end of the
 * source span (d_height rows of row-width + s_skip bytes), the work is
 * delegated to SDL_BlitCopy().  Otherwise the rows are copied from the
 * last one to the first, each with SDL_revcpy(), stepping both pointers
 * back by the same source stride.
 */
void
SDL_BlitCopyOverlap(SDL_BlitInfo * info)
{
    const int rowbytes = info->d_width * info->dst->BytesPerPixel;
    int rows = info->d_height;
    Uint8 *from = info->s_pixels;
    Uint8 *to = info->d_pixels;
    const int stride = rowbytes + info->s_skip;

    /* destination outside the source span: plain copy is fine */
    if (to < from || to >= from + rows * stride) {
        SDL_BlitCopy(info);
        return;
    }

    /* start at the last row and walk backwards */
    from += (rows - 1) * stride;
    to += (rows - 1) * stride;
    for (; rows--;) {
        SDL_revcpy(to, from, rowbytes);
        from -= stride;
        to -= stride;
    }
}
/* vi: set ts=4 sw=4 expandtab: */
/*
SDL - Simple DirectMedia Layer
Copyright (C) 1997-2006 Sam Lantinga
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
Sam Lantinga
slouken@libsdl.org
*/
/* Copy info->d_height rows of pixel data from info->s_pixels to
   info->d_pixels, one row at a time. */
void SDL_BlitCopy(SDL_BlitInfo * info);
/* Copy for buffers that may overlap: calls SDL_BlitCopy() when the
   destination lies outside the source span, otherwise copies the rows
   from last to first with SDL_revcpy(). */
void SDL_BlitCopyOverlap(SDL_BlitInfo * info);
/* vi: set ts=4 sw=4 expandtab: */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment