1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
|
!
! Fast SH memset
!
! by Toshiyasu Morita (tm@netcom.com)
!
! SH5 code by J"orn Rennecke (joern.rennecke@superh.com)
! Copyright 2002 SuperH Ltd.
!
#include "asm.h"
ENTRY(memset)
#if __SHMEDIA__
pta/l multiquad, tr0
andi r2, 7, r22
ptabs r18, tr2
mshflo.b r3,r3,r3
add r4, r22, r23
mperm.w r3, r63, r3 // Fill pattern now in every byte of r3
movi 8, r9
bgtu/u r23, r9, tr0
ldlo.q r2, 0, r7
shlli r4, 2, r4
movi -1, r8
SHHI r8, r4, r8
SHHI r8, r4, r8
mcmv r7, r8, r3
stlo.q r2, 0, r3
blink tr2, r63
multiquad:
pta/l lastquad, tr0
stlo.q r2, 0, r3
shlri r23, 3, r24
add r2, r4, r5
beqi/u r24, 1, tr0 // lastquad
pta/l loop, tr1
sub r2, r22, r25
andi r5, -8, r20 // calculate end address and
addi r20, -7*8, r8 // loop end address; This might overflow, so we need
// to use a different test before we start the loop
bge/u r24, r9, tr1 // loop
st.q r25, 8, r3
st.q r20, -8, r3
shlri r24, 1, r24
beqi/u r24, 1, tr0 // lastquad
st.q r25, 16, r3
st.q r20, -16, r3
beqi/u r24, 2, tr0 // lastquad
st.q r25, 24, r3
st.q r20, -24, r3
lastquad:
sthi.q r5, -1, r3
blink tr2,r63
loop:
alloco r25, 32
st.q r25, 8, r3
st.q r25, 16, r3
st.q r25, 24, r3
st.q r25, 32, r3
addi r25, 32, r25
bgeu/l r8, r25, tr1
st.q r20, -24, r3
st.q r20, -16, r3
st.q r20, -8, r3
sthi.q r5, -1, r3
blink tr2,r63
#else /* ! SHMEDIA, i.e. SH1 .. SH4 / SHcompact */
! Entry: r4: destination pointer
! r5: fill value
! r6: byte count
!
! Exit: r0-r3: trashed
!
! This assumes that the first four bytes of the address space (0..3) are
! reserved - usually by the linker script. Otherwise, we would had to check
! for the case of objects of the size 12..15 at address 0..3 .
#ifdef __SH5__
#define DST r2
#define VAL r3
#define CNT r4
#define TMP r5
#else
#define DST r4
#define VAL r5
#define CNT r6
#define TMP r2
#endif
mov #12,r0 ! Check for small number of bytes
cmp/gt CNT,r0
mov DST,r0
SL(bt, L_store_byte_loop_check0, add DST,CNT)
tst #3,r0 ! Align destination
SL(bt, L_dup_bytes, extu.b r5,r5)
.balignw 4,0x0009
L_align_loop:
mov.b VAL,@r0
add #1,r0
tst #3,r0
bf L_align_loop
L_dup_bytes:
swap.b VAL,TMP ! Duplicate bytes across longword
or TMP,VAL
swap.w VAL,TMP
or TMP,VAL
add #-16,CNT
.balignw 4,0x0009
L_store_long_loop:
mov.l VAL,@r0 ! Store double longs to memory
cmp/hs CNT,r0
mov.l VAL,@(4,r0)
SL(bf, L_store_long_loop, add #8,r0)
add #16,CNT
L_store_byte_loop_check0:
cmp/eq CNT,r0
bt L_exit
.balignw 4,0x0009
L_store_byte_loop:
mov.b VAL,@r0 ! Store bytes to memory
add #1,r0
cmp/eq CNT,r0
bf L_store_byte_loop
L_exit:
rts
mov r4,r0
#endif /* ! SHMEDIA */
|