• R/O
  • HTTP
  • SSH
  • HTTPS

Commit

Tags
No Tags

Frequently used words (click to add to your profile)

javac++androidlinuxc#windowsobjective-ccocoa誰得qtpythonphprubygameguibathyscaphec計画中(planning stage)翻訳omegatframeworktwitterdomtestvb.netdirectxゲームエンジンbtronarduinopreviewer

Commit MetaInfo

Revision: 663b8a0497c40a20668258bd69db13924c569c41 (tree)
Time: 2022-10-14 16:47:02
Author: Pavel Kozlov <pavel.kozlov@syno...>
Committer: Waldemar Brodkorb

Log Message

arc: add optimized string functions for ARCv3

Add the ability to use optimized versions of string functions for ARCv3 32-bit
CPUs with the UCLIBC_HAS_STRING_ARCH_OPT option. Add optimized
memcpy/memset/memcmp code for ARCv3 CPUs based on the code from newlib,
and adapt the existing optimized strchr/strcmp/strcpy/strlen for ARCv3.

Link to the Synopsys newlib repo with code for ARCv3 on GitHub:
https://github.com/foss-for-synopsys-dwc-arc-processors/newlib

Signed-off-by: Pavel Kozlov <pavel.kozlov@synopsys.com>

Change Summary

Incremental Difference

--- a/libc/string/arc/memcmp.S
+++ b/libc/string/arc/memcmp.S
@@ -1,5 +1,5 @@
11 /*
2- * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com)
2+ * Copyright (C) 2013, 2022 Synopsys, Inc. (www.synopsys.com)
33 * Copyright (C) 2007 ARC International (UK) LTD
44 *
55 * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -17,6 +17,8 @@
1717 #endif
1818
1919 ENTRY(memcmp)
20+
21+#if defined(__ARC700__) || defined(__ARCHS__)
2022 or r12,r0,r1
2123 asl_s r12,r12,30
2224 sub r3,r2,1
@@ -149,6 +151,96 @@ ENTRY(memcmp)
149151 .Lnil:
150152 j_s.d [blink]
151153 mov r0,0
154+
155+#elif (__ARC64_ARCH32__)
156+ ;; Based on Synopsys code from newlib's arc64/memcmp.S
157+ cmp r2, 32
158+ bls.d @.L_compare_1_bytes ; 32 bytes or fewer: bytewise compare only
159+ mov r3, r0 ; "r0" will be used as return value
160+ ;; From here on r3 walks lhs, r1 walks rhs, r2 holds the byte count.
161+ lsr r12, r2, 4 ; counter for 16-byte chunks
162+ xor r13, r13, r13 ; the mask showing unequal registers
163+
164+.L_compare_16_bytes:
165+ ld.ab r4, [r3, +4] ; post-increment loads: four words from each side
166+ ld.ab r5, [r1, +4]
167+ ld.ab r6, [r3, +4]
168+ ld.ab r7, [r1, +4]
169+ ld.ab r8, [r3, +4]
170+ ld.ab r9, [r1, +4]
171+ ld.ab r10, [r3, +4]
172+ ld.ab r11, [r1, +4]
173+ xor.f 0, r4, r5 ; one mask bit per word pair that differs
174+ xor.ne r13, r13, 0b0001
175+ xor.f 0, r6, r7
176+ xor.ne r13, r13, 0b0010
177+ xor.f 0, r8, r9
178+ xor.ne r13, r13, 0b0100
179+ xor.f 0, r10, r11
180+ xor.ne r13, r13, 0b1000
181+ brne r13, 0, @.L_unequal_find
182+ dbnz r12, @.L_compare_16_bytes
183+
184+ ;; Every load above post-increments its pointer, so r1 and r3 already
185+ ;; point at the first byte that has not been compared yet. They must
186+ ;; not be moved back here, or the tail loop would skip the final bytes.
187+ bmsk_s r2, r2, 3 ; any remaining bytes to compare
188+
189+.L_compare_1_bytes:
190+ cmp r2, 0
191+ jeq.d [blink] ; nothing left to compare
192+ xor_s r0, r0, r0 ; (delay slot) result is 0 so far
193+
194+2:
195+ ldb.ab r4, [r3, +1]
196+ ldb.ab r5, [r1, +1]
197+ sub.f r0, r4, r5 ; the byte difference doubles as the return value
198+ jne [blink]
199+ dbnz r2, @2b
200+ j_s [blink] ; all bytes equal: r0 is 0 from the last sub.f
201+
202+ ;; At this point, we want to find the _first_ comparison that marked the
203+ ;; inequality of "lhs" and "rhs"
204+.L_unequal_find:
205+ ffs r13, r13 ; index of the lowest set mask bit (0..3)
206+ asl r13, r13, 2 ; each case below occupies four instruction slots
207+ bi [r13] ; indexed branch into the cases that follow
208+.L_unequal_r4r5:
209+ mov r1, r4
210+ b.d @.L_diff_byte_in_regs
211+ mov r2, r5
212+ nop ; padding so that every case has the same size
213+.L_unequal_r6r7:
214+ mov r1, r6
215+ b.d @.L_diff_byte_in_regs
216+ mov r2, r7
217+ nop
218+.L_unequal_r8r9:
219+ mov r1, r8
220+ b.d @.L_diff_byte_in_regs
221+ mov r2, r9
222+ nop
223+.L_unequal_r10r11:
224+ mov r1, r10
225+ mov r2, r11
226+
227+ ;; fall-through
228+ ;; If we're here, that means the two operands are not equal.
229+.L_diff_byte_in_regs:
230+ xor r0, r1, r2 ; NOTE(review): lowest set bit = first byte in memory only for little-endian loads - confirm
231+ ffs r0, r0
232+ and r0, r0, 0x18 ; round the bit index down to a byte boundary (0, 8, 16 or 24)
233+ lsr r1, r1, r0
234+ lsr r2, r2, r0
235+ bmsk_s r1, r1, 7 ; keep only the differing byte of each word
236+ bmsk_s r2, r2, 7
237+ j_s.d [blink]
238+ sub r0, r1, r2 ; (delay slot) return lhs byte minus rhs byte
239+
240+#else
241+#error "Unsupported ARC CPU type"
242+#endif
243+
152244 END(memcmp)
153245 libc_hidden_def(memcmp)
154246
--- a/libc/string/arc/memcpy.S
+++ b/libc/string/arc/memcpy.S
@@ -1,5 +1,5 @@
11 /*
2- * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com)
2+ * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)
33 * Copyright (C) 2007 ARC International (UK) LTD
44 *
55 * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -7,13 +7,9 @@
77
88 #include <sysdep.h>
99
10-#if !defined(__ARC700__) && !defined(__ARCHS__)
11-#error "Neither ARC700 nor ARCHS is defined!"
12-#endif
13-
1410 ENTRY(memcpy)
1511
16-#ifdef __ARC700__
12+#if defined(__ARC700__)
1713 /* This memcpy implementation does not support objects of 1GB or larger -
1814 the check for alignment does not work then. */
1915 /* We assume that most sources and destinations are aligned, and
@@ -73,9 +69,9 @@ ENTRY(memcpy)
7369 .Lendbloop:
7470 j_s.d [blink]
7571 stb r12,[r5,0]
76-#endif /* __ARC700__ */
7772
78-#ifdef __ARCHS__
73+#elif defined(__ARCHS__)
74+
7975 #ifdef __LITTLE_ENDIAN__
8076 # define SHIFT_1(RX,RY,IMM) asl RX, RY, IMM ; <<
8177 # define SHIFT_2(RX,RY,IMM) lsr RX, RY, IMM ; >>
@@ -299,7 +295,58 @@ ENTRY(memcpy)
299295 stb.ab r6, [r3,1]
300296 .Lcopybytewise_3:
301297 j [blink]
302-#endif /* __ARCHS__ */
298+
299+#elif defined(__ARC64_ARCH32__)
300+ ;; Based on Synopsys code from newlib's arc64/memcpy.S
301+ lsr.f r11, r2, 4 ; counter for 16-byte chunks
302+ beq.d @.L_write_15_bytes ; no full 16-byte chunk: go straight to the tail
303+ mov r3, r0 ; work on a copy of "r0"
304+
305+.L_write_16_bytes:
306+#if defined(__ARC64_LL64__)
307+ ldd.ab r4, [r1, 8] ; 8-byte accesses: two loads and two stores per chunk
308+ ldd.ab r6, [r1, 8]
309+ std.ab r4, [r3, 8]
310+ std.ab r6, [r3, 8]
311+ dbnz r11, @.L_write_16_bytes
312+#else
313+ ld.ab r4, [r1, 4] ; 4-byte accesses: four loads and four stores per chunk
314+ ld.ab r5, [r1, 4]
315+ ld.ab r6, [r1, 4]
316+ ld.ab r7, [r1, 4]
317+ st.ab r4, [r3, 4]
318+ st.ab r5, [r3, 4]
319+ st.ab r6, [r3, 4]
320+ dbnz.d r11, @.L_write_16_bytes
321+ st.ab r7, [r3, 4] ; fourth store rides in the delay slot of dbnz.d
322+#endif
323+ bmsk_s r2, r2, 3 ; keep only the low four bits of the count (0..15 bytes left)
324+
325+.L_write_15_bytes:
326+ bbit0.d r2, 1, @1f ; bit 1 of the count clear: no halfword to copy
327+ lsr r11, r2, 2 ; (delay slot) r11 = remaining count / 4 = whole words left
328+ ldh.ab r4, [r1, 2]
329+ sth.ab r4, [r3, 2]
330+1:
331+ bbit0.d r2, 0, @1f ; bit 0 of the count clear: no single byte to copy
332+ xor r11, r11, 3 ; (delay slot) r11 = 3 - words left = word copies to skip
333+ ldb.ab r4, [r1, 1]
334+ stb.ab r4, [r3, 1]
335+1:
336+ asl r11, r11, 1 ; each word copy below is two instructions
337+ bi [r11] ; indexed branch into the unrolled word copies
338+ ld.ab r4,[r1, 4]
339+ st.ab r4,[r3, 4]
340+ ld.ab r4,[r1, 4]
341+ st.ab r4,[r3, 4]
342+ ld r4,[r1] ; last word: no pointer update needed
343+ st r4,[r3]
344+
345+ j_s [blink] ; r0 was never modified and still holds the destination
346+
347+#else
348+#error "Unsupported ARC CPU type"
349+#endif
303350
304351 END(memcpy)
305352 libc_hidden_def(memcpy)
--- a/libc/string/arc/memset.S
+++ b/libc/string/arc/memset.S
@@ -1,5 +1,5 @@
11 /*
2- * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com)
2+ * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)
33 * Copyright (C) 2007 ARC International (UK) LTD
44 *
55 * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -7,13 +7,9 @@
77
88 #include <sysdep.h>
99
10-#if !defined(__ARC700__) && !defined(__ARCHS__)
11-#error "Neither ARC700 nor ARCHS is defined!"
12-#endif
13-
1410 ENTRY(memset)
1511
16-#ifdef __ARC700__
12+#if defined(__ARC700__)
1713 #define SMALL 7 /* Must be at least 6 to deal with alignment/loop issues. */
1814
1915 mov_s r4,r0
@@ -52,9 +48,8 @@ ENTRY(memset)
5248 stb.ab r1,[r4,1]
5349 .Ltiny_end:
5450 j_s [blink]
55-#endif /* __ARC700__ */
5651
57-#ifdef __ARCHS__
52+#elif defined(__ARCHS__)
5853 #ifdef DONT_USE_PREALLOC
5954 #define PREWRITE(A,B) prefetchw [(A),(B)]
6055 #else
@@ -156,7 +151,55 @@ ENTRY(memset)
156151 .Lcopy3bytes:
157152
158153 j [blink]
159-#endif /* __ARCHS__ */
154+
155+#elif defined(__ARC64_ARCH32__)
156+ ;; Based on Synopsys code from newlib's arc64/memset.S
157+
158+ ;; Assemble the bytes to 32bit words
159+ bmsk_s r1, r1, 7 ; treat it like unsigned char
160+ lsl8 r3, r1
161+ or_s r1, r1, r3 ; fill byte now occupies the two low bytes
162+ lsl16 r3, r1
163+ or r6, r1, r3 ; r6 = fill byte replicated into all four bytes
164+ mov r7,r6 ; NOTE(review): presumably the second half of the r6/r7 pair written by std.ab - confirm
165+
166+ lsr.f r5, r2, 4 ; counter for 16-byte chunks
167+ beq.d @.L_write_15_bytes ; no full 16-byte chunk: go straight to the tail
168+ mov r4, r0 ; work on a copy of "r0"
169+
170+.L_write_16_bytes:
171+#if defined(__ARC64_LL64__)
172+ std.ab r6, [r4, 8] ; 8-byte stores: two per chunk
173+ std.ab r6, [r4, 8]
174+ dbnz r5, @.L_write_16_bytes
175+#else
176+ st.ab r6, [r4, 4] ; 4-byte stores: four per chunk
177+ st.ab r6, [r4, 4]
178+ st.ab r6, [r4, 4]
179+ dbnz.d r5, @.L_write_16_bytes
180+ st.ab r6, [r4, 4] ; fourth store rides in the delay slot of dbnz.d
181+#endif
182+ bmsk_s r2, r2, 3 ; keep only the low four bits of the count (0..15 bytes left)
183+
184+.L_write_15_bytes:
185+ bbit0.d r2, 1, @1f ; bit 1 of the count clear: no halfword to store
186+ lsr r3, r2, 2 ; (delay slot) r3 = remaining count / 4 = whole words left
187+ sth.ab r6, [r4, 2]
188+1:
189+ bbit0.d r2, 0, @1f ; bit 0 of the count clear: no single byte to store
190+ xor r3, r3, 3 ; (delay slot) r3 = 3 - words left = word stores to skip
191+ stb.ab r6, [r4, 1]
192+1:
193+ bi [r3] ; indexed branch into the unrolled word stores (one instruction each)
194+ st.ab r6,[r4, 4]
195+ st.ab r6,[r4, 4]
196+ st.ab r6,[r4, 4]
197+
198+ j_s [blink] ; r0 was never modified and still holds the destination
199+
200+#else
201+#error "Unsupported ARC CPU type"
202+#endif
160203
161204 END(memset)
162205 libc_hidden_def(memset)
--- a/libc/string/arc/strchr.S
+++ b/libc/string/arc/strchr.S
@@ -1,5 +1,5 @@
11 /*
2- * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com)
2+ * Copyright (C) 2013, 2022 Synopsys, Inc. (www.synopsys.com)
33 * Copyright (C) 2007 ARC International (UK) LTD
44 *
55 * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -7,6 +7,7 @@
77
88 #include <sysdep.h>
99 #include <features.h>
10+#include <asm.h>
1011
1112 /* ARC700 has a relatively long pipeline and branch prediction, so we want
1213 to avoid branches that are hard to predict. On the other hand, the
@@ -21,7 +22,7 @@ ENTRY(strchr)
2122 mov_s r3,0x01010101
2223 breq.d r2,r0,.Laligned
2324 asl r4,r5,16
24- sub_s r0,r0,r2
25+ SUBR_S r0,r0,r2
2526 asl r7,r2,3
2627 ld_s r2,[r0]
2728 #ifdef __LITTLE_ENDIAN__
@@ -77,10 +78,10 @@ ENTRY(strchr)
7778 sub r3,r7,1
7879 bic r3,r3,r7
7980 norm r2,r3
80- sub_s r0,r0,1
81- asr_s r2,r2,3
81+ SUBR_S r0,r0,1
82+ ASRR_S r2,r2,3
8283 j.d [blink]
83- sub_s r0,r0,r2
84+ SUBR_S r0,r0,r2
8485
8586 .balign 4
8687 .Lfound0_ua:
@@ -90,13 +91,13 @@ ENTRY(strchr)
9091 bic r3,r3,r6
9192 and r2,r3,r4
9293 or_s r12,r12,r2
93- sub_s r3,r12,1
94+ SUBR_S r3,r12,1
9495 bic_s r3,r3,r12
9596 norm r3,r3
96- add_s r0,r0,3
97- asr_s r12,r3,3
97+ ADDR_S r0,r0,3
98+ ASRR_S r12,r3,3
9899 asl.f 0,r2,r3
99- sub_s r0,r0,r12
100+ SUBR_S r0,r0,r12
100101 j_s.d [blink]
101102 mov.pl r0,0
102103 #else /* BIG ENDIAN */
@@ -106,10 +107,10 @@ ENTRY(strchr)
106107 bic r2,r7,r6
107108 .Lfound_char_b:
108109 norm r2,r2
109- sub_s r0,r0,4
110+ SUBR_S r0,r0,4
110111 asr_s r2,r2,3
111112 j.d [blink]
112- add_s r0,r0,r2
113+ ADDR_S r0,r0,r2
113114
114115 .Lfound0_ua:
115116 mov_s r3,r7
@@ -126,7 +127,7 @@ ENTRY(strchr)
126127 add.pl r3,r3,1
127128 asr_s r12,r3,3
128129 asl.f 0,r2,r3
129- add_s r0,r0,r12
130+ ADDR_S r0,r0,r12
130131 j_s.d [blink]
131132 mov.mi r0,0
132133 #endif /* ENDIAN */
--- a/libc/string/arc/strcmp.S
+++ b/libc/string/arc/strcmp.S
@@ -1,5 +1,5 @@
11 /*
2- * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com)
2+ * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)
33 * Copyright (C) 2007 ARC International (UK) LTD
44 *
55 * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -7,14 +7,11 @@
77
88 #include <features.h>
99 #include <sysdep.h>
10-
11-#if !defined(__ARC700__) && !defined(__ARCHS__)
12-#error "Neither ARC700 nor ARCHS is defined!"
13-#endif
10+#include <asm.h>
1411
1512 ENTRY(strcmp)
1613
17-#ifdef __ARC700__
14+#if defined(__ARC700__) || defined(__ARC64_ARCH32__)
1815 /* This is optimized primarily for the ARC700.
1916 It would be possible to speed up the loops by one cycle / word
2017 respective one cycle / byte by forcing double source 1 alignment, unrolling
@@ -38,7 +35,7 @@ ENTRY(strcmp)
3835 breq r2,r3,.Lwordloop
3936 #ifdef __LITTLE_ENDIAN__
4037 xor r0,r2,r3 ; mask for difference
41- sub_s r1,r0,1
38+ SUBR_S r1,r0,1
4239 bic_s r0,r0,r1 ; mask for least significant difference bit
4340 sub r1,r5,r0
4441 xor r0,r5,r1 ; mask for least significant difference byte
@@ -55,7 +52,7 @@ ENTRY(strcmp)
5552 .Lfound0:
5653 xor r0,r2,r3 ; mask for difference
5754 or r0,r0,r4 ; or in zero indicator
58- sub_s r1,r0,1
55+ SUBR_S r1,r0,1
5956 bic_s r0,r0,r1 ; mask for least significant difference bit
6057 sub r1,r5,r0
6158 xor r0,r5,r1 ; mask for least significant difference byte
@@ -99,9 +96,8 @@ ENTRY(strcmp)
9996 .Lcmpend:
10097 j_s.d [blink]
10198 sub r0,r2,r3
102-#endif /* __ARC700__ */
10399
104-#ifdef __ARCHS__
100+#elif defined(__ARCHS__)
105101 or r2, r0, r1
106102 bmsk_s r2, r2, 1
107103 brne r2, 0, @.Lcharloop
@@ -168,7 +164,10 @@ ENTRY(strcmp)
168164 .Lcmpend:
169165 j_s.d [blink]
170166 sub r0, r2, r3
171-#endif /* __ARCHS__ */
167+
168+#else
169+#error "Unsupported ARC CPU type"
170+#endif
172171
173172 END(strcmp)
174173 libc_hidden_def(strcmp)
--- a/libc/string/arc/strlen.S
+++ b/libc/string/arc/strlen.S
@@ -1,5 +1,5 @@
11 /*
2- * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com)
2+ * Copyright (C) 2013, 2022 Synopsys, Inc. (www.synopsys.com)
33 * Copyright (C) 2007 ARC International (UK) LTD
44 *
55 * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -7,6 +7,7 @@
77
88
99 #include <sysdep.h>
10+#include <asm.h>
1011
1112 ENTRY(strlen)
1213 or r3,r0,7
@@ -15,7 +16,7 @@ ENTRY(strlen)
1516 mov r4,0x01010101
1617 ; uses long immediate
1718 #ifdef __LITTLE_ENDIAN__
18- asl_s r1,r0,3
19+ ASLR_S r1,r0,3
1920 btst_s r0,2
2021 asl r7,r4,r1
2122 ror r5,r4
@@ -59,7 +60,7 @@ ENTRY(strlen)
5960 sub.ne r3,r3,4
6061 mov.eq r1,r12
6162 #ifdef __LITTLE_ENDIAN__
62- sub_s r2,r1,1
63+ SUBR_S r2,r1,1
6364 bic_s r2,r2,r1
6465 norm r1,r2
6566 sub_s r0,r0,3
--- a/libc/sysdeps/linux/arc/asm.h
+++ b/libc/sysdeps/linux/arc/asm.h
@@ -7,6 +7,13 @@
77 #ifndef _ARC_ASM_H
88 #define _ARC_ASM_H
99
10+/*
11+ * Some 16-bit instructions were excluded from the ARCv3 ISA;
12+ * the following macros are introduced to handle these changes in one place.
13+ * This allows the existing ARCv2 code to stay unchanged: the 16-bit versions
14+ * of the instructions are used for ARCv2 and are replaced with 32-bit versions for ARCv3.
15+ */
16+
1017 #if defined (__ARC64_ARCH32__)
1118
1219 .macro PUSHR reg
@@ -25,6 +32,22 @@
2532 pop \reg
2633 .endm
2734
35+.macro SUBR_S dst,src1,src2
36+ sub \dst, \src1, \src2 /* ARCv3 build: plain sub stands in for sub_s */
37+.endm
38+
39+.macro ADDR_S dst,src1,src2
40+ add \dst, \src1, \src2 /* ARCv3 build: plain add stands in for add_s */
41+.endm
42+
43+.macro ASRR_S dst,src1,src2
44+ asr \dst, \src1, \src2 /* ARCv3 build: plain asr stands in for asr_s */
45+.endm
46+
47+.macro ASLR_S dst,src1,src2
48+ asl \dst, \src1, \src2 /* ARCv3 build: plain asl stands in for asl_s */
49+.endm
50+
2851 #elif defined (__ARC64_ARCH64__)
2952
3053 # error ARCv3 64-bit is not supported by uClibc-ng
@@ -47,6 +70,22 @@
4770 pop_s \reg
4871 .endm
4972
73+.macro SUBR_S dst,src1,src2
74+ sub_s \dst, \src1, \src2 /* expands to the 16-bit sub_s form */
75+.endm
76+
77+.macro ADDR_S dst,src1,src2
78+ add_s \dst, \src1, \src2 /* expands to the 16-bit add_s form */
79+.endm
80+
81+.macro ASRR_S dst,src1,src2
82+ asr_s \dst, \src1, \src2 /* expands to the 16-bit asr_s form */
83+.endm
84+
85+.macro ASLR_S dst,src1,src2
86+ asl_s \dst, \src1, \src2 /* expands to the 16-bit asl_s form */
87+.endm
88+
5089 #endif
5190
5291 #endif /* _ARC_ASM_H */