1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
@ ARM code version of rotation routines.
@
@ @author Robin Watts (robin@wss.co.uk)
@
@ When rotating a block of memory to the screen, the key facts to bear in
@ mind are:
@ * Screen memory is uncached - therefore to get best performance we want
@ to write runs of horizontal pixels so the write buffer can kick in and
@ merge the buffered writes.
@ * Reading large numbers of pixels to be written out horizontally, pulls
@ in lots of cache lines. We need to be careful not to cache bust here.
@ * The 16 or 32 way set associativity for screens can hurt us here too.
@
@ A good compromise is therefore to write out in bursts of 4 horizontal
@ pixels at once.
.text
.global ARM_rotate

@-----------------------------------------------------------------------
@ ARM_rotate
@
@ Copies a block of w*h 16-bit pixels from srcPtr to destPtr. The source
@ is addressed with arbitrary (signed) byte steps srcPixStep/srcLineStep,
@ which is what lets the caller express a rotation. The destination is
@ always written left to right with a fixed pixel step of 2 bytes and a
@ line step of dstLineStep bytes. No palette conversion is performed.
@
@ Width (w) and height (h) are expressed in destination terms:
@   dest pixel (x, y) at  destPtr + y*dstLineStep + x*2
@   is read from          srcPtr  + x*srcPixStep  + y*srcLineStep
@
@ C-equivalent prototype (argument types presumed from usage - confirm
@ against the caller):
@   void ARM_rotate(uint16 *destPtr, const uint16 *srcPtr, int w, int h,
@                   int dstLineStep, int srcPixStep, int srcLineStep);
@
@ In:    r0   = destPtr
@        r1   = srcPtr
@        r2   = w
@        r3   = h
@        [sp] = dstLineStep, srcPixStep (e.g. 480), srcLineStep (e.g. 2 or -2)
@ Out:   nothing
@ Clobb: r0-r3, r12, flags (r4-r11 are saved and restored)
@
@ Strategy: the block is processed in vertical stripes. The first stripe
@ is 1-3 pixels wide if needed to bring the destination up to a 4 pixel
@ (8 byte) boundary, then 4 pixel wide stripes are used, then a final
@ 1-3 pixel wide stripe mops up whatever is left.
@-----------------------------------------------------------------------
ARM_rotate:
        MOV     r12,r13                 @ r12 = entry sp -> stacked arguments
        STMFD   r13!,{r4-r11,r14}
        LDMFD   r12,{r4-r6}             @ r4 = dstLineStep
                                        @ r5 = srcPixStep
                                        @ r6 = srcLineStep

        @ Every row loop below is a do-while, so an empty block must be
        @ rejected here or at least one pixel would be copied.
        CMP     r2,#0                   @ w <= 0 ?
        CMPGT   r3,#0                   @ ... else h <= 0 ?
        BLE     rotate_end

        AND     r7,r0,#6
        MOV     r7,r7,LSR #1            @ r7 = pixels past a multiple of 4 we start on
        RSB     r7,r7,#4                @ r7 = width of the first stripe (4 if aligned)
rotate_loop:
        CMP     r7,r2
        MOVGT   r7,r2                   @ r7 = width to do this time = min(r7, w)
        SUBS    r7,r7,#4                @ r7 = width-4
        BLT     thin                    @ less than 4 pixels wide

        @ Per-row fix-ups for the 4 wide stripe. Both are invariant across
        @ all the 4 wide stripes, so compute them once here.
        SUB     r8,r4,#6                @ r8 = dstLineStep-6
        ADD     r9,r5,r5,LSL #1         @ r9 = 3*srcPixStep
        SUB     r9,r6,r9                @ r9 = srcLineStep-3*srcPixStep
x_loop_4:
        @ Copy one 4 pixel wide stripe, h rows high.
        MOV     r7,r3                   @ r7 = h (row counter)
y_loop_4:
        LDRH    r10,[r1],r5             @ r10 = *(src)
        LDRH    r11,[r1],r5             @ r11 = *(src+srcPixStep)
        LDRH    r12,[r1],r5             @ r12 = *(src+srcPixStep*2)
        LDRH    r14,[r1],r9             @ r14 = *(src+srcPixStep*3)  src += srcLineStep
        STRH    r10,[r0],#2             @ *(ptr)   = r10
        STRH    r11,[r0],#2             @ *(ptr+2) = r11
        STRH    r12,[r0],#2             @ *(ptr+4) = r12
        STRH    r14,[r0],r8             @ *(ptr+6) = r14             ptr += dstLineStep
        SUBS    r7,r7,#1                @ h--
        BGT     y_loop_4

        @ Rewind both pointers to the top row and step on to the next stripe.
        MUL     r10,r3,r6               @ r10 = h*srcLineStep
        ADD     r1,r1,r5,LSL #2         @ src += 4*srcPixStep
        SUB     r1,r1,r10               @ src -= h*srcLineStep
        MUL     r10,r3,r4               @ r10 = h*dstLineStep
        ADD     r0,r0,#8                @ dst += 4 pixels
        SUB     r0,r0,r10               @ dst -= h*dstLineStep
        SUBS    r2,r2,#4                @ r2 = w -= 4
        BEQ     rotate_end              @ if w = 0, none left.
        SUBS    r7,r2,#4                @ r7 = w - 4
        BGE     x_loop_4                @ if 4 or more left, go back.
thin:
        @ Here r7 = width-4 with width in 1..3, i.e. r7 is -3, -2 or -1.
        MOV     r14,r3                  @ r14 = h (row counter for the thin loops)
thin_lp:
        ADDS    r7,r7,#2                @ r7 = width-2: LT => 1, EQ => 2, GT => 3
        BGT     just_3                  @ GT must be tested before EQ/GE, as
        BEQ     just_2                  @ GE would swallow the 3 wide case too.
        @ Just do a 1 pixel wide stripe. Either the last pixel stripe, or
        @ the first pixel stripe to align us.
y_loop_1:
        LDRH    r10,[r1],r6             @ r10 = *(src)   src += srcLineStep
        SUBS    r14,r14,#1              @ h--
        STRH    r10,[r0],r4             @ *(ptr) = r10   ptr += dstLineStep
        BGT     y_loop_1
        MUL     r10,r3,r6               @ How much to step r1 back to undo this line?
        ADD     r1,r1,r5                @ Move r1 on by srcPixStep
        SUB     r1,r1,r10               @ Move r1 back by amount just added on
        MUL     r10,r3,r4               @ How much to step r0 back to undo this line?
        ADD     r0,r0,#2                @ Move r0 on by 1 pixel
        SUB     r0,r0,r10               @ Move r0 back by amount just added on
        SUBS    r2,r2,#1                @ If we havent finished (i.e. we were doing
        MOV     r7,r2                   @ the first pixel rather than the last one)
        BGT     rotate_loop             @ then jump back to do some more
rotate_end:
        LDMFD   r13!,{r4-r11,PC}
just_2:
        @ Just do a 2 pixel wide stripe. Either the last stripe, or
        @ the first stripe to align us.
        SUB     r9,r6,r5                @ r9 = srcLineStep - srcPixStep
        SUB     r8,r4,#2                @ r8 = dstLineStep - 2
y_loop_2:
        LDRH    r10,[r1],r5             @ r10 = *(src)
        LDRH    r11,[r1],r9             @ r11 = *(src+srcPixStep)  src += srcLineStep
        SUBS    r14,r14,#1              @ h--
        STRH    r10,[r0],#2             @ *(ptr)   = r10
        STRH    r11,[r0],r8             @ *(ptr+2) = r11           ptr += dstLineStep
        BGT     y_loop_2
        MUL     r10,r3,r6               @ How much to step r1 back to undo this line?
        ADD     r1,r1,r5,LSL #1         @ Move r1 on by srcPixStep*2
        SUB     r1,r1,r10               @ Move r1 back by amount just added on
        MUL     r10,r3,r4               @ How much to step r0 back to undo this line?
        ADD     r0,r0,#4                @ Move r0 on by 2 pixels
        SUB     r0,r0,r10               @ Move r0 back by amount just added on
        SUBS    r2,r2,#2                @ If we havent finished (i.e. we were doing
        MOV     r7,r2                   @ the first stripe rather than the last one)
        BGT     rotate_loop             @ then jump back to do some more
        LDMFD   r13!,{r4-r11,PC}
just_3:
        @ Just do a 3 pixel wide stripe. Either the last stripe, or
        @ the first stripe to align us.
        SUB     r9,r6,r5,LSL #1         @ r9 = srcLineStep - 2*srcPixStep
        SUB     r8,r4,#4                @ r8 = dstLineStep - 4
y_loop_3:
        LDRH    r10,[r1],r5             @ r10 = *(src)
        LDRH    r11,[r1],r5             @ r11 = *(src+srcPixStep)
        LDRH    r12,[r1],r9             @ r12 = *(src+srcPixStep*2)  src += srcLineStep
        STRH    r10,[r0],#2             @ *(ptr)   = r10
        STRH    r11,[r0],#2             @ *(ptr+2) = r11
        STRH    r12,[r0],r8             @ *(ptr+4) = r12             ptr += dstLineStep
        SUBS    r14,r14,#1              @ h--
        BGT     y_loop_3
        MUL     r10,r3,r6               @ How much to step r1 back to undo this line?
        ADD     r1,r1,r5                @ Move r1 on by srcPixStep*3
        ADD     r1,r1,r5,LSL #1
        SUB     r1,r1,r10               @ Move r1 back by amount just added on
        MUL     r10,r3,r4               @ How much to step r0 back to undo this line?
        ADD     r0,r0,#6                @ Move r0 on by 3 pixels
        SUB     r0,r0,r10               @ Move r0 back by amount just added on
        SUBS    r2,r2,#3                @ If we havent finished (i.e. we were doing
        MOV     r7,r2                   @ the first stripe rather than the last one)
        BGT     rotate_loop             @ then jump back to do some more
        LDMFD   r13!,{r4-r11,PC}