1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
|
! Copyright (C) 2008-2012 Imagination Technologies Ltd.
.text
.global _memcpy
.type _memcpy,function
! D1Ar1 dst
! D0Ar2 src
! D1Ar3 cnt
! D0Re0 dst
_memcpy:
CMP D1Ar3, #16
MOV A1.2, D0Ar2 ! source pointer
MOV A0.2, D1Ar1 ! destination pointer
MOV A0.3, D1Ar1 ! for return value
! If there are less than 16 bytes to copy use the byte copy loop
BGE $Llong_copy
$Lbyte_copy:
! Simply copy a byte at a time
SUBS TXRPT, D1Ar3, #1
BLT $Lend
$Lloop_byte:
GETB D1Re0, [A1.2++]
SETB [A0.2++], D1Re0
BR $Lloop_byte
$Lend:
! Finally set return value and return
MOV D0Re0, A0.3
MOV PC, D1RtP
$Llong_copy:
ANDS D1Ar5, D1Ar1, #7 ! test destination alignment
BZ $Laligned_dst
! The destination address is not 8 byte aligned. We will copy bytes from
! the source to the destination until the remaining data has an 8 byte
! destination address alignment (i.e we should never copy more than 7
! bytes here).
$Lalign_dst:
GETB D0Re0, [A1.2++]
ADD D1Ar5, D1Ar5, #1 ! dest is aligned when D1Ar5 reaches #8
SUB D1Ar3, D1Ar3, #1 ! decrement count of remaining bytes
SETB [A0.2++], D0Re0
CMP D1Ar5, #8
BNE $Lalign_dst
! We have at least (16 - 7) = 9 bytes to copy - calculate the number of 8 byte
! blocks, then jump to the unaligned copy loop or fall through to the aligned
! copy loop as appropriate.
$Laligned_dst:
MOV D0Ar4, A1.2
LSR D1Ar5, D1Ar3, #3 ! D1Ar5 = number of 8 byte blocks
ANDS D0Ar4, D0Ar4, #7 ! test source alignment
BNZ $Lunaligned_copy ! if unaligned, use unaligned copy loop
! Both source and destination are 8 byte aligned - the easy case.
$Laligned_copy:
LSRS D1Ar5, D1Ar3, #5 ! D1Ar5 = number of 32 byte blocks
BZ $Lbyte_copy
SUB TXRPT, D1Ar5, #1
$Laligned_32:
GETL D0Re0, D1Re0, [A1.2++]
GETL D0Ar6, D1Ar5, [A1.2++]
SETL [A0.2++], D0Re0, D1Re0
SETL [A0.2++], D0Ar6, D1Ar5
GETL D0Re0, D1Re0, [A1.2++]
GETL D0Ar6, D1Ar5, [A1.2++]
SETL [A0.2++], D0Re0, D1Re0
SETL [A0.2++], D0Ar6, D1Ar5
BR $Laligned_32
! If there are any remaining bytes use the byte copy loop, otherwise we are done
ANDS D1Ar3, D1Ar3, #0x1f
BNZ $Lbyte_copy
B $Lend
! The destination is 8 byte aligned but the source is not, and there are 8
! or more bytes to be copied.
$Lunaligned_copy:
! Adjust the source pointer (A1.2) to the 8 byte boundary before its
! current value
MOV D0Ar4, A1.2
MOV D0Ar6, A1.2
ANDMB D0Ar4, D0Ar4, #0xfff8
MOV A1.2, D0Ar4
! Save the number of bytes of mis-alignment in D0Ar4 for use later
SUBS D0Ar6, D0Ar6, D0Ar4
MOV D0Ar4, D0Ar6
! if there is no mis-alignment after all, use the aligned copy loop
BZ $Laligned_copy
! prefetch 8 bytes
GETL D0Re0, D1Re0, [A1.2]
SUB TXRPT, D1Ar5, #1
! There are 3 mis-alignment cases to be considered. Less than 4 bytes, exactly
! 4 bytes, and more than 4 bytes.
CMP D0Ar6, #4
BLT $Lunaligned_1_2_3 ! use 1-3 byte mis-alignment loop
BZ $Lunaligned_4 ! use 4 byte mis-alignment loop
! The mis-alignment is more than 4 bytes
$Lunaligned_5_6_7:
SUB D0Ar6, D0Ar6, #4
! Calculate the bit offsets required for the shift operations necesssary
! to align the data.
! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset)
MULW D0Ar6, D0Ar6, #8
MOV D1Ar5, #32
SUB D1Ar5, D1Ar5, D0Ar6
! Move data 4 bytes before we enter the main loop
MOV D0Re0, D1Re0
$Lloop_5_6_7:
GETL D0Ar2, D1Ar1, [++A1.2]
! form 64-bit data in D0Re0, D1Re0
LSR D0Re0, D0Re0, D0Ar6
MOV D1Re0, D0Ar2
LSL D1Re0, D1Re0, D1Ar5
ADD D0Re0, D0Re0, D1Re0
LSR D0Ar2, D0Ar2, D0Ar6
LSL D1Re0, D1Ar1, D1Ar5
ADD D1Re0, D1Re0, D0Ar2
SETL [A0.2++], D0Re0, D1Re0
MOV D0Re0, D1Ar1
BR $Lloop_5_6_7
B $Lunaligned_end
$Lunaligned_1_2_3:
! Calculate the bit offsets required for the shift operations necesssary
! to align the data.
! D0Ar6 = bit offset, D1Ar5 = (32 - bit offset)
MULW D0Ar6, D0Ar6, #8
MOV D1Ar5, #32
SUB D1Ar5, D1Ar5, D0Ar6
$Lloop_1_2_3:
! form 64-bit data in D0Re0,D1Re0
LSR D0Re0, D0Re0, D0Ar6
LSL D1Ar1, D1Re0, D1Ar5
ADD D0Re0, D0Re0, D1Ar1
MOV D0Ar2, D1Re0
LSR D0FrT, D0Ar2, D0Ar6
GETL D0Ar2, D1Ar1, [++A1.2]
MOV D1Re0, D0Ar2
LSL D1Re0, D1Re0, D1Ar5
ADD D1Re0, D1Re0, D0FrT
SETL [A0.2++], D0Re0, D1Re0
MOV D0Re0, D0Ar2
MOV D1Re0, D1Ar1
BR $Lloop_1_2_3
B $Lunaligned_end
! The 4 byte mis-alignment case - this does not require any shifting, just a
! shuffling of registers.
$Lunaligned_4:
MOV D0Re0, D1Re0
$Lloop_4:
GETL D0Ar2, D1Ar1, [++A1.2]
MOV D1Re0, D0Ar2
SETL [A0.2++], D0Re0, D1Re0
MOV D0Re0, D1Ar1
BR $Lloop_4
$Lunaligned_end:
! If there are no remaining bytes to copy, we are done.
ANDS D1Ar3, D1Ar3, #7
BZ $Lend
! Re-adjust the source pointer (A1.2) back to the actual (unaligned) byte
! address of the remaining bytes, and fall through to the byte copy loop.
MOV D0Ar6, A1.2
ADD D1Ar5, D0Ar4, D0Ar6
MOV A1.2, D1Ar5
B $Lbyte_copy
.size _memcpy,.-_memcpy
|