1/* strspn with SSE4.2 intrinsics
2 Copyright (C) 2009-2020 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20#include <nmmintrin.h>
21#include <string.h>
22#include "varshift.h"
23
24/* We use 0x12:
25 _SIDD_SBYTE_OPS
26 | _SIDD_CMP_EQUAL_ANY
27 | _SIDD_NEGATIVE_POLARITY
28 | _SIDD_LEAST_SIGNIFICANT
29 on pcmpistri to compare xmm/mem128
30
31 0 1 2 3 4 5 6 7 8 9 A B C D E F
32 X X X X X X X X X X X X X X X X
33
34 against xmm
35
36 0 1 2 3 4 5 6 7 8 9 A B C D E F
37 A A A A A A A A A A A A A A A A
38
 to find out whether the first 16-byte data element contains any byte
 that is not in A and, if so, the offset of the first such byte.
 There are 2 cases:
41
42 1. The first 16byte data element has the non-A byte, including
43 EOS, at the offset X.
44 2. The first 16byte data element is valid and doesn't have the non-A
45 byte.
46
47 Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
48
49 case ECX CFlag ZFlag SFlag
50 1 X 1 0/1 0
51 2 16 0 0 0
52
53 We exit from the loop for case 1. */
54
/* Fallback strspn implementation, defined outside this file.
   __strspn_sse42 forwards to it when the accept set is longer than
   16 bytes.  */
extern size_t __strspn_sse2 (const char *, const char *) attribute_hidden;
56
57
58size_t
59__attribute__ ((section (".text.sse4.2")))
60__strspn_sse42 (const char *s, const char *a)
61{
62 if (*a == 0)
63 return 0;
64
65 const char *aligned;
66 __m128i mask;
67 int offset = (int) ((size_t) a & 15);
68 if (offset != 0)
69 {
70 /* Load masks. */
71 aligned = (const char *) ((size_t) a & -16L);
72 __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
73
74 mask = __m128i_shift_right (mask0, offset);
75
76 /* Find where the NULL terminator is. */
77 int length = _mm_cmpistri (mask, mask, 0x3a);
78 if (length == 16 - offset)
79 {
80 /* There is no NULL terminator. */
81 __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
82 int index = _mm_cmpistri (mask1, mask1, 0x3a);
83 length += index;
84
85 /* Don't use SSE4.2 if the length of A > 16. */
86 if (length > 16)
87 return __strspn_sse2 (s, a);
88
89 if (index != 0)
90 {
91 /* Combine mask0 and mask1. We could play games with
92 palignr, but frankly this data should be in L1 now
93 so do the merge via an unaligned load. */
94 mask = _mm_loadu_si128 ((__m128i *) a);
95 }
96 }
97 }
98 else
99 {
100 /* A is aligned. */
101 mask = _mm_load_si128 ((__m128i *) a);
102
103 /* Find where the NULL terminator is. */
104 int length = _mm_cmpistri (mask, mask, 0x3a);
105 if (length == 16)
106 {
107 /* There is no NULL terminator. Don't use SSE4.2 if the length
108 of A > 16. */
109 if (a[16] != 0)
110 return __strspn_sse2 (s, a);
111 }
112 }
113
114 offset = (int) ((size_t) s & 15);
115 if (offset != 0)
116 {
117 /* Check partial string. */
118 aligned = (const char *) ((size_t) s & -16L);
119 __m128i value = _mm_load_si128 ((__m128i *) aligned);
120
121 value = __m128i_shift_right (value, offset);
122
123 int length = _mm_cmpistri (mask, value, 0x12);
124 /* No need to check CFlag since it is always 1. */
125 if (length < 16 - offset)
126 return length;
127 /* Find where the NULL terminator is. */
128 int index = _mm_cmpistri (value, value, 0x3a);
129 if (index < 16 - offset)
130 return length;
131 aligned += 16;
132 }
133 else
134 aligned = s;
135
136 while (1)
137 {
138 __m128i value = _mm_load_si128 ((__m128i *) aligned);
139 int index = _mm_cmpistri (mask, value, 0x12);
140 int cflag = _mm_cmpistrc (mask, value, 0x12);
141 if (cflag)
142 return (size_t) (aligned + index - s);
143 aligned += 16;
144 }
145}
146