1/* memcpy with SSSE3
2 Copyright (C) 2010-2020 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20#include <sysdep.h>
21
22#if IS_IN (libc)
23
24#include "asm-syntax.h"
25
26#ifndef MEMCPY
27# define MEMCPY __memcpy_ssse3
28# define MEMCPY_CHK __memcpy_chk_ssse3
29# define MEMPCPY __mempcpy_ssse3
30# define MEMPCPY_CHK __mempcpy_chk_ssse3
31#endif
32
/* JMPTBL (I, B): one jump-table entry for label I, expressed as the
   offset of I from the table base B (I - B).  */
33#define JMPTBL(I, B) I - B
34
35/* Branch to an entry in a jump table.  TABLE is a jump table with
   relative offsets.  INDEX is a register that contains the index into
   the jump table; it is overwritten with the branch target.  SCALE is
   the scale of INDEX.
   The macro loads the address of TABLE into %r11 (clobbering it),
   sign-extends the 32-bit entry found at TABLE + INDEX * SCALE, adds
   the table address to it and jumps to the result.  The ud2 placed
   after the indirect jump is not reached by normal control flow.  */
38#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
39 lea TABLE(%rip), %r11; \
40 movslq (%r11, INDEX, SCALE), INDEX; \
41 lea (%r11, INDEX), INDEX; \
42 _CET_NOTRACK jmp *INDEX; \
43 ud2
44
45 .section .text.ssse3,"ax",@progbits
46#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
/* MEMPCPY_CHK: checked entry placed in front of MEMPCPY.  Compares
   %RCX_LP with the length in %RDX_LP (unsigned) and jumps to __chk_fail
   when %RCX_LP is the smaller one.  There is no return or jump after
   the check, so execution otherwise continues with the code that
   follows this ENTRY/END pair.
   NOTE(review): %RCX_LP is presumably the destination object size
   supplied by the fortify wrapper -- confirm against the caller.  */
47ENTRY (MEMPCPY_CHK)
48 cmp %RDX_LP, %RCX_LP
49 jb HIDDEN_JUMPTARGET (__chk_fail)
50END (MEMPCPY_CHK)
51
/* MEMPCPY: sets the return value %RAX_LP = %RDI_LP + %RDX_LP
   (destination plus length) and then joins the shared copy code at
   L(start).  */
52ENTRY (MEMPCPY)
53 mov %RDI_LP, %RAX_LP
54 add %RDX_LP, %RAX_LP
55 jmp L(start)
56END (MEMPCPY)
57#endif
58
59#if !defined USE_AS_BCOPY
/* MEMCPY_CHK: checked entry placed in front of MEMCPY.  Jumps to
   __chk_fail when %RCX_LP is below the length in %RDX_LP (unsigned
   compare); otherwise execution continues with the code that follows
   this ENTRY/END pair.
   NOTE(review): %RCX_LP is presumably the destination object size
   supplied by the fortify wrapper -- confirm against the caller.  */
60ENTRY (MEMCPY_CHK)
61 cmp %RDX_LP, %RCX_LP
62 jb HIDDEN_JUMPTARGET (__chk_fail)
63END (MEMCPY_CHK)
64#endif
65
/* MEMCPY: main entry point.
   In:  %rdi = destination, %rsi = source, %rdx = length (roles as used
	by the code below).
   Out: %rax = destination, or destination + length when built with
	USE_AS_MEMPCPY.
   Lengths of at most 79 bytes are dispatched directly through
   L(table_less_80bytes); longer copies continue at L(80bytesormore).  */
66ENTRY (MEMCPY)
67 mov %RDI_LP, %RAX_LP /* Return value starts out as the destination.  */
68#ifdef USE_AS_MEMPCPY
69 add %RDX_LP, %RAX_LP /* Return destination + length instead.  */
70#endif
71
72#ifdef __ILP32__
73 /* Clear the upper 32 bits. */
74 mov %edx, %edx
75#endif
76
77#ifdef USE_AS_MEMMOVE
/* Direction choice: a destination below the source copies forward,
   identical pointers go to L(write_0bytes), lengths of at most 79 bytes
   also take the forward path, everything else copies backward.  */
78 cmp %rsi, %rdi
79 jb L(copy_forward)
80 je L(write_0bytes)
81 cmp $79, %rdx
82 jbe L(copy_forward)
83 jmp L(copy_backward)
84L(copy_forward):
85#endif
86L(start):
87 cmp $79, %rdx
88 lea L(table_less_80bytes)(%rip), %r11 /* lea keeps the flags from cmp.  */
89 ja L(80bytesormore)
/* Short copy: read the signed 32-bit table entry for this length,
   advance both pointers by the length and jump to table base + entry.  */
90 movslq (%r11, %rdx, 4), %r9
91 add %rdx, %rsi
92 add %rdx, %rdi
93 add %r11, %r9
94 _CET_NOTRACK jmp *%r9
95 ud2 /* Not reached by normal control flow.  */
96
97 .p2align 4
/* L(80bytesormore): forward set-up for lengths above 79 bytes.
   Saves the first 16 source bytes in %xmm0 and the original destination
   in %r8, rounds %rdi up to the next 16-byte boundary (it always moves
   by 1..16 bytes) and adjusts %rsi and %rdx by the same amount.  It then
   branches to L(large_page_fwd), L(shl_0) or an L(shl_table) entry.  */
98L(80bytesormore):
99#ifndef USE_AS_MEMMOVE
/* Signed comparison of the low bytes of source and destination only;
   "less or equal" selects the backward copy.
   NOTE(review): presumably a direction heuristic that is harmless
   because memcpy buffers must not overlap -- confirm.  */
100 cmp %dil, %sil
101 jle L(copy_backward)
102#endif
103
104 movdqu (%rsi), %xmm0 /* Head block; the copy loops store it to (%r8).  */
105 mov %rdi, %rcx
106 and $-16, %rdi
107 add $16, %rdi /* %rdi = next 16-byte boundary above the destination.  */
108 mov %rcx, %r8 /* %r8 = original destination.  */
109 sub %rdi, %rcx /* %rcx = -(bytes skipped by the rounding).  */
110 add %rcx, %rdx /* Shrink the length by the skipped bytes ...  */
111 sub %rcx, %rsi /* ... and advance the source by the same amount.  */
112
113#ifdef SHARED_CACHE_SIZE_HALF
114 mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
115#else
116 mov __x86_shared_cache_size_half(%rip), %RCX_LP
117#endif
118 cmp %rcx, %rdx
119 mov %rsi, %r9 /* mov leaves the flags from cmp untouched.  */
120 ja L(large_page_fwd) /* Length above the shared-cache threshold.  */
121 and $0xf, %r9 /* %r9 = source offset within a 16-byte block.  */
122 jz L(shl_0) /* Source is 16-byte aligned as well.  */
123#ifdef DATA_CACHE_SIZE_HALF
124 mov $DATA_CACHE_SIZE_HALF, %RCX_LP
125#else
126 mov __x86_data_cache_size_half(%rip), %RCX_LP
127#endif
/* %rcx now holds the data-cache threshold that the L(shl_N) code
   compares %rdx against.  */
128 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
129
130 .p2align 4
/* L(copy_backward): backward set-up.  Saves the last 16 source bytes in
   %xmm0 and the address they belong at in %r8, moves %rsi/%rdi to the
   ends of the buffers, rounds %rdi down to a 16-byte boundary and
   adjusts %rsi and %rdx by the same amount.  It then branches to
   L(large_page_bwd), L(shl_0_bwd) or an L(shl_table_bwd) entry.  */
131L(copy_backward):
132 movdqu -16(%rsi, %rdx), %xmm0 /* Tail block; stored to (%r8) later.  */
133 add %rdx, %rsi
134 lea -16(%rdi, %rdx), %r8 /* Destination address of the tail block.  */
135 add %rdx, %rdi
136
137 mov %rdi, %rcx
138 and $0xf, %rcx /* %rcx = low four bits of the destination end.  */
139 xor %rcx, %rdi /* Clear those bits: %rdi is now 16-byte aligned.  */
140 sub %rcx, %rdx
141 sub %rcx, %rsi
142
143#ifdef SHARED_CACHE_SIZE_HALF
144 mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
145#else
146 mov __x86_shared_cache_size_half(%rip), %RCX_LP
147#endif
148
149 cmp %rcx, %rdx
150 mov %rsi, %r9 /* mov leaves the flags from cmp untouched.  */
151 ja L(large_page_bwd) /* Length above the shared-cache threshold.  */
152 and $0xf, %r9 /* %r9 = source-end offset within a 16-byte block.  */
153 jz L(shl_0_bwd) /* Source end is 16-byte aligned as well.  */
154#ifdef DATA_CACHE_SIZE_HALF
155 mov $DATA_CACHE_SIZE_HALF, %RCX_LP
156#else
157 mov __x86_data_cache_size_half(%rip), %RCX_LP
158#endif
/* %rcx now holds the data-cache threshold that the L(shl_N_bwd) code
   compares %rdx against.  */
159 BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
160
161 .p2align 4
/* L(shl_0): forward copy with source and destination both 16-byte
   aligned.  Copies one 16-byte block, stores the saved head block
   (%xmm0) at the original destination (%r8), then either enters
   L(shl_0_gobble) when more than 128 bytes remain or copies an optional
   64-byte block and finishes through L(table_less_80bytes).  */
162L(shl_0):
163 sub $16, %rdx
164 movdqa (%rsi), %xmm1
165 add $16, %rsi
166 movdqa %xmm1, (%rdi)
167 add $16, %rdi
168 cmp $128, %rdx
169 movdqu %xmm0, (%r8) /* The store leaves the cmp flags intact for ja.  */
170 ja L(shl_0_gobble)
171 cmp $64, %rdx
172 jb L(shl_0_less_64bytes)
173 movaps (%rsi), %xmm4
174 movaps 16(%rsi), %xmm1
175 movaps 32(%rsi), %xmm2
176 movaps 48(%rsi), %xmm3
177 movaps %xmm4, (%rdi)
178 movaps %xmm1, 16(%rdi)
179 movaps %xmm2, 32(%rdi)
180 movaps %xmm3, 48(%rdi)
181 sub $64, %rdx
182 add $64, %rsi
183 add $64, %rdi
184L(shl_0_less_64bytes):
/* Advance both pointers by the remaining length before the table
   dispatch, exactly as L(start) does for short copies.  */
185 add %rdx, %rsi
186 add %rdx, %rdi
187 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
188
189 .p2align 4
/* L(shl_0_gobble): aligned forward bulk copy, 128 bytes per iteration.
   Switches to L(shl_0_gobble_mem_loop) when the remaining length is at
   least the data-cache threshold, otherwise runs the loop below.  %rdx
   is kept biased by -128 while looping.  After the loop an optional
   64-byte block is copied and the rest is handed to
   L(table_less_80bytes).  */
190L(shl_0_gobble):
191#ifdef DATA_CACHE_SIZE_HALF
192 cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
193#else
194 cmp __x86_data_cache_size_half(%rip), %RDX_LP
195#endif
196 lea -128(%rdx), %rdx /* Bias the count; lea keeps the cmp flags.  */
197 jae L(shl_0_gobble_mem_loop)
198L(shl_0_gobble_cache_loop):
199 movdqa (%rsi), %xmm4
200 movaps 0x10(%rsi), %xmm1
201 movaps 0x20(%rsi), %xmm2
202 movaps 0x30(%rsi), %xmm3
203
204 movdqa %xmm4, (%rdi)
205 movaps %xmm1, 0x10(%rdi)
206 movaps %xmm2, 0x20(%rdi)
207 movaps %xmm3, 0x30(%rdi)
208
209 sub $128, %rdx /* Sets CF for the jae at the bottom of the loop.  */
210 movaps 0x40(%rsi), %xmm4
211 movaps 0x50(%rsi), %xmm5
212 movaps 0x60(%rsi), %xmm6
213 movaps 0x70(%rsi), %xmm7
214 lea 0x80(%rsi), %rsi
215 movaps %xmm4, 0x40(%rdi)
216 movaps %xmm5, 0x50(%rdi)
217 movaps %xmm6, 0x60(%rdi)
218 movaps %xmm7, 0x70(%rdi)
219 lea 0x80(%rdi), %rdi
220
221 jae L(shl_0_gobble_cache_loop) /* No borrow: another 128 bytes fit.  */
222 cmp $-0x40, %rdx /* Signed test on the biased count: under 64 left?  */
223 lea 0x80(%rdx), %rdx /* Remove the bias; flags are preserved.  */
224 jl L(shl_0_cache_less_64bytes)
225
226 movdqa (%rsi), %xmm4
227 sub $0x40, %rdx
228 movdqa 0x10(%rsi), %xmm1
229
230 movdqa %xmm4, (%rdi)
231 movdqa %xmm1, 0x10(%rdi)
232
233 movdqa 0x20(%rsi), %xmm4
234 movdqa 0x30(%rsi), %xmm1
235 add $0x40, %rsi
236
237 movdqa %xmm4, 0x20(%rdi)
238 movdqa %xmm1, 0x30(%rdi)
239 add $0x40, %rdi
240L(shl_0_cache_less_64bytes):
/* Advance both pointers by the remaining length, then dispatch.  */
241 add %rdx, %rsi
242 add %rdx, %rdi
243 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
244
245 .p2align 4
/* L(shl_0_gobble_mem_loop): aligned forward bulk copy for lengths at or
   above the data-cache threshold.  Each iteration prefetches two
   addresses ahead of %rsi (prefetcht0), loads 128 bytes into
   %xmm0-%xmm7 and then stores them.  %rdx arrives biased by -128.
   After the loop optional 64-byte and 32-byte blocks are copied and the
   rest is handed to L(table_less_80bytes).  */
246L(shl_0_gobble_mem_loop):
247 prefetcht0 0x1c0(%rsi)
248 prefetcht0 0x280(%rsi)
249
250 movdqa (%rsi), %xmm0
251 movdqa 0x10(%rsi), %xmm1
252 movdqa 0x20(%rsi), %xmm2
253 movdqa 0x30(%rsi), %xmm3
254 movdqa 0x40(%rsi), %xmm4
255 movdqa 0x50(%rsi), %xmm5
256 movdqa 0x60(%rsi), %xmm6
257 movdqa 0x70(%rsi), %xmm7
258 lea 0x80(%rsi), %rsi
259 sub $0x80, %rdx /* Sets CF for the jae at the bottom of the loop.  */
260 movdqa %xmm0, (%rdi)
261 movdqa %xmm1, 0x10(%rdi)
262 movdqa %xmm2, 0x20(%rdi)
263 movdqa %xmm3, 0x30(%rdi)
264 movdqa %xmm4, 0x40(%rdi)
265 movdqa %xmm5, 0x50(%rdi)
266 movdqa %xmm6, 0x60(%rdi)
267 movdqa %xmm7, 0x70(%rdi)
268 lea 0x80(%rdi), %rdi
269
270 jae L(shl_0_gobble_mem_loop) /* No borrow: another 128 bytes fit.  */
271 cmp $-0x40, %rdx /* Signed test on the biased count: under 64 left?  */
272 lea 0x80(%rdx), %rdx /* Remove the bias; flags are preserved.  */
273 jl L(shl_0_mem_less_64bytes)
274
275 movdqa (%rsi), %xmm0
276 sub $0x40, %rdx
277 movdqa 0x10(%rsi), %xmm1
278
279 movdqa %xmm0, (%rdi)
280 movdqa %xmm1, 0x10(%rdi)
281
282 movdqa 0x20(%rsi), %xmm0
283 movdqa 0x30(%rsi), %xmm1
284 add $0x40, %rsi
285
286 movdqa %xmm0, 0x20(%rdi)
287 movdqa %xmm1, 0x30(%rdi)
288 add $0x40, %rdi
289L(shl_0_mem_less_64bytes):
290 cmp $0x20, %rdx
291 jb L(shl_0_mem_less_32bytes)
292 movdqa (%rsi), %xmm0
293 sub $0x20, %rdx
294 movdqa 0x10(%rsi), %xmm1
295 add $0x20, %rsi
296 movdqa %xmm0, (%rdi)
297 movdqa %xmm1, 0x10(%rdi)
298 add $0x20, %rdi
299L(shl_0_mem_less_32bytes):
/* Advance both pointers by the remaining length, then dispatch.  */
300 add %rdx, %rdi
301 add %rdx, %rsi
302 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
303
304 .p2align 4
/* L(shl_0_bwd): backward copy with source end and destination end both
   16-byte aligned; %rsi/%rdi move downwards.  Copies one 16-byte block,
   stores the saved tail block (%xmm0) at %r8, then either enters
   L(shl_0_gobble_bwd) when more than 128 bytes remain or copies an
   optional 64-byte block and finishes through L(table_less_80bytes)
   without adjusting the pointers any further.  */
305L(shl_0_bwd):
306 sub $16, %rdx
307 movdqa -0x10(%rsi), %xmm1
308 sub $16, %rsi
309 movdqa %xmm1, -0x10(%rdi)
310 sub $16, %rdi
311 cmp $0x80, %rdx
312 movdqu %xmm0, (%r8) /* The store leaves the cmp flags intact for ja.  */
313 ja L(shl_0_gobble_bwd)
314 cmp $64, %rdx
315 jb L(shl_0_less_64bytes_bwd)
316 movaps -0x10(%rsi), %xmm0
317 movaps -0x20(%rsi), %xmm1
318 movaps -0x30(%rsi), %xmm2
319 movaps -0x40(%rsi), %xmm3
320 movaps %xmm0, -0x10(%rdi)
321 movaps %xmm1, -0x20(%rdi)
322 movaps %xmm2, -0x30(%rdi)
323 movaps %xmm3, -0x40(%rdi)
324 sub $64, %rdx
325 sub $0x40, %rsi
326 sub $0x40, %rdi
327L(shl_0_less_64bytes_bwd):
328 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
329
330 .p2align 4
/* L(shl_0_gobble_bwd): aligned backward bulk copy, 128 bytes per
   iteration.  Switches to L(shl_0_gobble_mem_bwd_loop) when the
   remaining length is at least the data-cache threshold, otherwise runs
   the loop below.  %rdx is kept biased by -128 while looping.  After
   the loop an optional 64-byte block is copied and the rest is handed
   to L(table_less_80bytes).  */
331L(shl_0_gobble_bwd):
332#ifdef DATA_CACHE_SIZE_HALF
333 cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
334#else
335 cmp __x86_data_cache_size_half(%rip), %RDX_LP
336#endif
337 lea -128(%rdx), %rdx /* Bias the count; lea keeps the cmp flags.  */
338 jae L(shl_0_gobble_mem_bwd_loop)
339L(shl_0_gobble_bwd_loop):
340 movdqa -0x10(%rsi), %xmm0
341 movaps -0x20(%rsi), %xmm1
342 movaps -0x30(%rsi), %xmm2
343 movaps -0x40(%rsi), %xmm3
344
345 movdqa %xmm0, -0x10(%rdi)
346 movaps %xmm1, -0x20(%rdi)
347 movaps %xmm2, -0x30(%rdi)
348 movaps %xmm3, -0x40(%rdi)
349
350 sub $0x80, %rdx /* Sets CF for the jae at the bottom of the loop.  */
351 movaps -0x50(%rsi), %xmm4
352 movaps -0x60(%rsi), %xmm5
353 movaps -0x70(%rsi), %xmm6
354 movaps -0x80(%rsi), %xmm7
355 lea -0x80(%rsi), %rsi
356 movaps %xmm4, -0x50(%rdi)
357 movaps %xmm5, -0x60(%rdi)
358 movaps %xmm6, -0x70(%rdi)
359 movaps %xmm7, -0x80(%rdi)
360 lea -0x80(%rdi), %rdi
361
362 jae L(shl_0_gobble_bwd_loop) /* No borrow: another 128 bytes fit.  */
363 cmp $-0x40, %rdx /* Signed test on the biased count: under 64 left?  */
364 lea 0x80(%rdx), %rdx /* Remove the bias; flags are preserved.  */
365 jl L(shl_0_gobble_bwd_less_64bytes)
366
367 movdqa -0x10(%rsi), %xmm0
368 sub $0x40, %rdx
369 movdqa -0x20(%rsi), %xmm1
370
371 movdqa %xmm0, -0x10(%rdi)
372 movdqa %xmm1, -0x20(%rdi)
373
374 movdqa -0x30(%rsi), %xmm0
375 movdqa -0x40(%rsi), %xmm1
376 sub $0x40, %rsi
377
378 movdqa %xmm0, -0x30(%rdi)
379 movdqa %xmm1, -0x40(%rdi)
380 sub $0x40, %rdi
381L(shl_0_gobble_bwd_less_64bytes):
382 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
383
384 .p2align 4
/* L(shl_0_gobble_mem_bwd_loop): aligned backward bulk copy for lengths
   at or above the data-cache threshold.  Each iteration prefetches two
   addresses below %rsi (prefetcht0), loads 128 bytes into %xmm0-%xmm7
   and then stores them.  %rdx arrives biased by -128.  After the loop
   optional 64-byte and 32-byte blocks are copied and the rest is handed
   to L(table_less_80bytes).  */
385L(shl_0_gobble_mem_bwd_loop):
386 prefetcht0 -0x1c0(%rsi)
387 prefetcht0 -0x280(%rsi)
388 movdqa -0x10(%rsi), %xmm0
389 movdqa -0x20(%rsi), %xmm1
390 movdqa -0x30(%rsi), %xmm2
391 movdqa -0x40(%rsi), %xmm3
392 movdqa -0x50(%rsi), %xmm4
393 movdqa -0x60(%rsi), %xmm5
394 movdqa -0x70(%rsi), %xmm6
395 movdqa -0x80(%rsi), %xmm7
396 lea -0x80(%rsi), %rsi
397 sub $0x80, %rdx /* Sets CF for the jae at the bottom of the loop.  */
398 movdqa %xmm0, -0x10(%rdi)
399 movdqa %xmm1, -0x20(%rdi)
400 movdqa %xmm2, -0x30(%rdi)
401 movdqa %xmm3, -0x40(%rdi)
402 movdqa %xmm4, -0x50(%rdi)
403 movdqa %xmm5, -0x60(%rdi)
404 movdqa %xmm6, -0x70(%rdi)
405 movdqa %xmm7, -0x80(%rdi)
406 lea -0x80(%rdi), %rdi
407
408 jae L(shl_0_gobble_mem_bwd_loop) /* No borrow: another 128 bytes fit.  */
409 cmp $-0x40, %rdx /* Signed test on the biased count: under 64 left?  */
410 lea 0x80(%rdx), %rdx /* Remove the bias; flags are preserved.  */
411 jl L(shl_0_mem_bwd_less_64bytes)
412
413 movdqa -0x10(%rsi), %xmm0
414 sub $0x40, %rdx
415 movdqa -0x20(%rsi), %xmm1
416
417 movdqa %xmm0, -0x10(%rdi)
418 movdqa %xmm1, -0x20(%rdi)
419
420 movdqa -0x30(%rsi), %xmm0
421 movdqa -0x40(%rsi), %xmm1
422 sub $0x40, %rsi
423
424 movdqa %xmm0, -0x30(%rdi)
425 movdqa %xmm1, -0x40(%rdi)
426 sub $0x40, %rdi
427L(shl_0_mem_bwd_less_64bytes):
428 cmp $0x20, %rdx
429 jb L(shl_0_mem_bwd_less_32bytes)
430 movdqa -0x10(%rsi), %xmm0
431 sub $0x20, %rdx
432 movdqa -0x20(%rsi), %xmm1
433 sub $0x20, %rsi
434 movdqa %xmm0, -0x10(%rdi)
435 movdqa %xmm1, -0x20(%rdi)
436 sub $0x20, %rdi
437L(shl_0_mem_bwd_less_32bytes):
438 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
439
440 .p2align 4
/* L(shl_1): forward copy for a source 1 byte past a 16-byte boundary
   (the aligned loads below depend on that) and an aligned destination.
   Expects %r9 = address of L(shl_1) as left by the jump-table dispatch
   (the table is outside this view -- confirm), %rcx = cache threshold,
   %rdx = bytes left, %xmm0/%r8 = saved head block and its address.
   Each round loads four aligned blocks, merges neighbours with
   palignr $1 and stores 64 aligned bytes.  */
441L(shl_1):
442 lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9 /* %r9 = loop entry without prefetch.  */
443 cmp %rcx, %rdx
444 movaps -0x01(%rsi), %xmm1 /* Aligned block containing (%rsi).  */
445 jb L(L1_fwd) /* Below the threshold: keep the plain loop.  */
446 lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9 /* Else enter at the prefetching label.  */
447L(L1_fwd):
448 lea -64(%rdx), %rdx /* Bias the count by -64.  */
449 _CET_NOTRACK jmp *%r9
450 ud2
451L(shl_1_loop_L2):
452 prefetchnta 0x1c0(%rsi)
453L(shl_1_loop_L1):
454 sub $64, %rdx /* CF from here decides the jb further down.  */
455 movaps 0x0f(%rsi), %xmm2
456 movaps 0x1f(%rsi), %xmm3
457 movaps 0x2f(%rsi), %xmm4
458 movaps 0x3f(%rsi), %xmm5
459 movdqa %xmm5, %xmm6 /* Unshifted copy of the last block for the next round.  */
460 palignr $1, %xmm4, %xmm5
461 lea 64(%rsi), %rsi
462 palignr $1, %xmm3, %xmm4
463 palignr $1, %xmm2, %xmm3
464 lea 64(%rdi), %rdi
465 palignr $1, %xmm1, %xmm2
466 movdqa %xmm6, %xmm1
467 movdqa %xmm2, -0x40(%rdi)
468 movaps %xmm3, -0x30(%rdi)
469 jb L(shl_1_end) /* The subtraction borrowed: last round.  */
470 movaps %xmm4, -0x20(%rdi)
471 movaps %xmm5, -0x10(%rdi)
472 _CET_NOTRACK jmp *%r9
473 ud2
474L(shl_1_end):
475 movaps %xmm4, -0x20(%rdi)
476 lea 64(%rdx), %rdx /* Remove the bias: %rdx = bytes still to copy.  */
477 movaps %xmm5, -0x10(%rdi)
478 add %rdx, %rdi
479 movdqu %xmm0, (%r8) /* Store the saved head block.  */
480 add %rdx, %rsi
481 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
482
483 .p2align 4
/* L(shl_1_bwd): backward copy for a source end 1 byte past a 16-byte
   boundary and an aligned destination end; %rsi/%rdi move downwards.
   Expects %r9 = address of L(shl_1_bwd) as left by the jump-table
   dispatch (table outside this view -- confirm), %rcx = cache
   threshold, %rdx = bytes left, %xmm0/%r8 = saved tail block and its
   address.  Each round loads four aligned blocks below %rsi, merges
   neighbours with palignr $1 and stores 64 aligned bytes.  */
484L(shl_1_bwd):
485 lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9 /* %r9 = loop entry without prefetch.  */
486 cmp %rcx, %rdx
487 movaps -0x01(%rsi), %xmm1 /* Aligned block containing the byte below %rsi.  */
488 jb L(L1_bwd) /* Below the threshold: keep the plain loop.  */
489 lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9 /* Else enter at the prefetching label.  */
490L(L1_bwd):
491 lea -64(%rdx), %rdx /* Bias the count by -64.  */
492 _CET_NOTRACK jmp *%r9
493 ud2
494L(shl_1_bwd_loop_L2):
495 prefetchnta -0x1c0(%rsi)
496L(shl_1_bwd_loop_L1):
497 movaps -0x11(%rsi), %xmm2
498 sub $0x40, %rdx /* CF from here decides the jb further down.  */
499 movaps -0x21(%rsi), %xmm3
500 movaps -0x31(%rsi), %xmm4
501 movaps -0x41(%rsi), %xmm5
502 lea -0x40(%rsi), %rsi
503 palignr $1, %xmm2, %xmm1
504 palignr $1, %xmm3, %xmm2
505 palignr $1, %xmm4, %xmm3
506 palignr $1, %xmm5, %xmm4
507
508 movaps %xmm1, -0x10(%rdi)
509 movaps %xmm5, %xmm1 /* Lowest block carries over to the next round.  */
510
511 movaps %xmm2, -0x20(%rdi)
512 lea -0x40(%rdi), %rdi
513
514 movaps %xmm3, 0x10(%rdi) /* Same slot as -0x30 before the lea.  */
515 jb L(shl_1_bwd_end) /* The subtraction borrowed: last round.  */
516 movaps %xmm4, (%rdi)
517 _CET_NOTRACK jmp *%r9
518 ud2
519L(shl_1_bwd_end):
520 movaps %xmm4, (%rdi)
521 lea 64(%rdx), %rdx /* Remove the bias: %rdx = bytes still to copy.  */
522 movdqu %xmm0, (%r8) /* Store the saved tail block.  */
523 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
524
525 .p2align 4
/* L(shl_2): forward copy for a source 2 bytes past a 16-byte boundary
   (the aligned loads below depend on that) and an aligned destination.
   Expects %r9 = address of L(shl_2) as left by the jump-table dispatch
   (the table is outside this view -- confirm), %rcx = cache threshold,
   %rdx = bytes left, %xmm0/%r8 = saved head block and its address.
   Each round loads four aligned blocks, merges neighbours with
   palignr $2 and stores 64 aligned bytes.  */
526L(shl_2):
527 lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9 /* %r9 = loop entry without prefetch.  */
528 cmp %rcx, %rdx
529 movaps -0x02(%rsi), %xmm1 /* Aligned block containing (%rsi).  */
530 jb L(L2_fwd) /* Below the threshold: keep the plain loop.  */
531 lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9 /* Else enter at the prefetching label.  */
532L(L2_fwd):
533 lea -64(%rdx), %rdx /* Bias the count by -64.  */
534 _CET_NOTRACK jmp *%r9
535 ud2
536L(shl_2_loop_L2):
537 prefetchnta 0x1c0(%rsi)
538L(shl_2_loop_L1):
539 sub $64, %rdx /* CF from here decides the jb further down.  */
540 movaps 0x0e(%rsi), %xmm2
541 movaps 0x1e(%rsi), %xmm3
542 movaps 0x2e(%rsi), %xmm4
543 movaps 0x3e(%rsi), %xmm5
544 movdqa %xmm5, %xmm6 /* Unshifted copy of the last block for the next round.  */
545 palignr $2, %xmm4, %xmm5
546 lea 64(%rsi), %rsi
547 palignr $2, %xmm3, %xmm4
548 palignr $2, %xmm2, %xmm3
549 lea 64(%rdi), %rdi
550 palignr $2, %xmm1, %xmm2
551 movdqa %xmm6, %xmm1
552 movdqa %xmm2, -0x40(%rdi)
553 movaps %xmm3, -0x30(%rdi)
554 jb L(shl_2_end) /* The subtraction borrowed: last round.  */
555 movaps %xmm4, -0x20(%rdi)
556 movaps %xmm5, -0x10(%rdi)
557 _CET_NOTRACK jmp *%r9
558 ud2
559L(shl_2_end):
560 movaps %xmm4, -0x20(%rdi)
561 lea 64(%rdx), %rdx /* Remove the bias: %rdx = bytes still to copy.  */
562 movaps %xmm5, -0x10(%rdi)
563 add %rdx, %rdi
564 movdqu %xmm0, (%r8) /* Store the saved head block.  */
565 add %rdx, %rsi
566 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
567
568 .p2align 4
/* L(shl_2_bwd): backward copy for a source end 2 bytes past a 16-byte
   boundary and an aligned destination end; %rsi/%rdi move downwards.
   Expects %r9 = address of L(shl_2_bwd) as left by the jump-table
   dispatch (table outside this view -- confirm), %rcx = cache
   threshold, %rdx = bytes left, %xmm0/%r8 = saved tail block and its
   address.  Each round loads four aligned blocks below %rsi, merges
   neighbours with palignr $2 and stores 64 aligned bytes.  */
569L(shl_2_bwd):
570 lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9 /* %r9 = loop entry without prefetch.  */
571 cmp %rcx, %rdx
572 movaps -0x02(%rsi), %xmm1 /* Aligned block containing the byte below %rsi.  */
573 jb L(L2_bwd) /* Below the threshold: keep the plain loop.  */
574 lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9 /* Else enter at the prefetching label.  */
575L(L2_bwd):
576 lea -64(%rdx), %rdx /* Bias the count by -64.  */
577 _CET_NOTRACK jmp *%r9
578 ud2
579L(shl_2_bwd_loop_L2):
580 prefetchnta -0x1c0(%rsi)
581L(shl_2_bwd_loop_L1):
582 movaps -0x12(%rsi), %xmm2
583 sub $0x40, %rdx /* CF from here decides the jb further down.  */
584 movaps -0x22(%rsi), %xmm3
585 movaps -0x32(%rsi), %xmm4
586 movaps -0x42(%rsi), %xmm5
587 lea -0x40(%rsi), %rsi
588 palignr $2, %xmm2, %xmm1
589 palignr $2, %xmm3, %xmm2
590 palignr $2, %xmm4, %xmm3
591 palignr $2, %xmm5, %xmm4
592
593 movaps %xmm1, -0x10(%rdi)
594 movaps %xmm5, %xmm1 /* Lowest block carries over to the next round.  */
595
596 movaps %xmm2, -0x20(%rdi)
597 lea -0x40(%rdi), %rdi
598
599 movaps %xmm3, 0x10(%rdi) /* Same slot as -0x30 before the lea.  */
600 jb L(shl_2_bwd_end) /* The subtraction borrowed: last round.  */
601 movaps %xmm4, (%rdi)
602 _CET_NOTRACK jmp *%r9
603 ud2
604L(shl_2_bwd_end):
605 movaps %xmm4, (%rdi)
606 lea 64(%rdx), %rdx /* Remove the bias: %rdx = bytes still to copy.  */
607 movdqu %xmm0, (%r8) /* Store the saved tail block.  */
608 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
609
610 .p2align 4
/* L(shl_3): forward copy for a source 3 bytes past a 16-byte boundary
   (the aligned loads below depend on that) and an aligned destination.
   Expects %r9 = address of L(shl_3) as left by the jump-table dispatch
   (the table is outside this view -- confirm), %rcx = cache threshold,
   %rdx = bytes left, %xmm0/%r8 = saved head block and its address.
   Each round loads four aligned blocks, merges neighbours with
   palignr $3 and stores 64 aligned bytes.  */
611L(shl_3):
612 lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9 /* %r9 = loop entry without prefetch.  */
613 cmp %rcx, %rdx
614 movaps -0x03(%rsi), %xmm1 /* Aligned block containing (%rsi).  */
615 jb L(L3_fwd) /* Below the threshold: keep the plain loop.  */
616 lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9 /* Else enter at the prefetching label.  */
617L(L3_fwd):
618 lea -64(%rdx), %rdx /* Bias the count by -64.  */
619 _CET_NOTRACK jmp *%r9
620 ud2
621L(shl_3_loop_L2):
622 prefetchnta 0x1c0(%rsi)
623L(shl_3_loop_L1):
624 sub $64, %rdx /* CF from here decides the jb further down.  */
625 movaps 0x0d(%rsi), %xmm2
626 movaps 0x1d(%rsi), %xmm3
627 movaps 0x2d(%rsi), %xmm4
628 movaps 0x3d(%rsi), %xmm5
629 movdqa %xmm5, %xmm6 /* Unshifted copy of the last block for the next round.  */
630 palignr $3, %xmm4, %xmm5
631 lea 64(%rsi), %rsi
632 palignr $3, %xmm3, %xmm4
633 palignr $3, %xmm2, %xmm3
634 lea 64(%rdi), %rdi
635 palignr $3, %xmm1, %xmm2
636 movdqa %xmm6, %xmm1
637 movdqa %xmm2, -0x40(%rdi)
638 movaps %xmm3, -0x30(%rdi)
639 jb L(shl_3_end) /* The subtraction borrowed: last round.  */
640 movaps %xmm4, -0x20(%rdi)
641 movaps %xmm5, -0x10(%rdi)
642 _CET_NOTRACK jmp *%r9
643 ud2
644L(shl_3_end):
645 movaps %xmm4, -0x20(%rdi)
646 lea 64(%rdx), %rdx /* Remove the bias: %rdx = bytes still to copy.  */
647 movaps %xmm5, -0x10(%rdi)
648 add %rdx, %rdi
649 movdqu %xmm0, (%r8) /* Store the saved head block.  */
650 add %rdx, %rsi
651 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
652
653 .p2align 4
/* L(shl_3_bwd): backward copy for a source end 3 bytes past a 16-byte
   boundary and an aligned destination end; %rsi/%rdi move downwards.
   Expects %r9 = address of L(shl_3_bwd) as left by the jump-table
   dispatch (table outside this view -- confirm), %rcx = cache
   threshold, %rdx = bytes left, %xmm0/%r8 = saved tail block and its
   address.  Each round loads four aligned blocks below %rsi, merges
   neighbours with palignr $3 and stores 64 aligned bytes.  */
654L(shl_3_bwd):
655 lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9 /* %r9 = loop entry without prefetch.  */
656 cmp %rcx, %rdx
657 movaps -0x03(%rsi), %xmm1 /* Aligned block containing the byte below %rsi.  */
658 jb L(L3_bwd) /* Below the threshold: keep the plain loop.  */
659 lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9 /* Else enter at the prefetching label.  */
660L(L3_bwd):
661 lea -64(%rdx), %rdx /* Bias the count by -64.  */
662 _CET_NOTRACK jmp *%r9
663 ud2
664L(shl_3_bwd_loop_L2):
665 prefetchnta -0x1c0(%rsi)
666L(shl_3_bwd_loop_L1):
667 movaps -0x13(%rsi), %xmm2
668 sub $0x40, %rdx /* CF from here decides the jb further down.  */
669 movaps -0x23(%rsi), %xmm3
670 movaps -0x33(%rsi), %xmm4
671 movaps -0x43(%rsi), %xmm5
672 lea -0x40(%rsi), %rsi
673 palignr $3, %xmm2, %xmm1
674 palignr $3, %xmm3, %xmm2
675 palignr $3, %xmm4, %xmm3
676 palignr $3, %xmm5, %xmm4
677
678 movaps %xmm1, -0x10(%rdi)
679 movaps %xmm5, %xmm1 /* Lowest block carries over to the next round.  */
680
681 movaps %xmm2, -0x20(%rdi)
682 lea -0x40(%rdi), %rdi
683
684 movaps %xmm3, 0x10(%rdi) /* Same slot as -0x30 before the lea.  */
685 jb L(shl_3_bwd_end) /* The subtraction borrowed: last round.  */
686 movaps %xmm4, (%rdi)
687 _CET_NOTRACK jmp *%r9
688 ud2
689L(shl_3_bwd_end):
690 movaps %xmm4, (%rdi)
691 lea 64(%rdx), %rdx /* Remove the bias: %rdx = bytes still to copy.  */
692 movdqu %xmm0, (%r8) /* Store the saved tail block.  */
693 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
694
695 .p2align 4
/* L(shl_4): forward copy for a source 4 bytes past a 16-byte boundary
   (the aligned loads below depend on that) and an aligned destination.
   Expects %r9 = address of L(shl_4) as left by the jump-table dispatch
   (the table is outside this view -- confirm), %rcx = cache threshold,
   %rdx = bytes left, %xmm0/%r8 = saved head block and its address.
   Each round loads four aligned blocks, merges neighbours with
   palignr $4 and stores 64 aligned bytes.  */
696L(shl_4):
697 lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9 /* %r9 = loop entry without prefetch.  */
698 cmp %rcx, %rdx
699 movaps -0x04(%rsi), %xmm1 /* Aligned block containing (%rsi).  */
700 jb L(L4_fwd) /* Below the threshold: keep the plain loop.  */
701 lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9 /* Else enter at the prefetching label.  */
702L(L4_fwd):
703 lea -64(%rdx), %rdx /* Bias the count by -64.  */
704 _CET_NOTRACK jmp *%r9
705 ud2
706L(shl_4_loop_L2):
707 prefetchnta 0x1c0(%rsi)
708L(shl_4_loop_L1):
709 sub $64, %rdx /* CF from here decides the jb further down.  */
710 movaps 0x0c(%rsi), %xmm2
711 movaps 0x1c(%rsi), %xmm3
712 movaps 0x2c(%rsi), %xmm4
713 movaps 0x3c(%rsi), %xmm5
714 movdqa %xmm5, %xmm6 /* Unshifted copy of the last block for the next round.  */
715 palignr $4, %xmm4, %xmm5
716 lea 64(%rsi), %rsi
717 palignr $4, %xmm3, %xmm4
718 palignr $4, %xmm2, %xmm3
719 lea 64(%rdi), %rdi
720 palignr $4, %xmm1, %xmm2
721 movdqa %xmm6, %xmm1
722 movdqa %xmm2, -0x40(%rdi)
723 movaps %xmm3, -0x30(%rdi)
724 jb L(shl_4_end) /* The subtraction borrowed: last round.  */
725 movaps %xmm4, -0x20(%rdi)
726 movaps %xmm5, -0x10(%rdi)
727 _CET_NOTRACK jmp *%r9
728 ud2
729L(shl_4_end):
730 movaps %xmm4, -0x20(%rdi)
731 lea 64(%rdx), %rdx /* Remove the bias: %rdx = bytes still to copy.  */
732 movaps %xmm5, -0x10(%rdi)
733 add %rdx, %rdi
734 movdqu %xmm0, (%r8) /* Store the saved head block.  */
735 add %rdx, %rsi
736 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
737
738 .p2align 4
/* L(shl_4_bwd): backward copy for a source end 4 bytes past a 16-byte
   boundary and an aligned destination end; %rsi/%rdi move downwards.
   Expects %r9 = address of L(shl_4_bwd) as left by the jump-table
   dispatch (table outside this view -- confirm), %rcx = cache
   threshold, %rdx = bytes left, %xmm0/%r8 = saved tail block and its
   address.  Each round loads four aligned blocks below %rsi, merges
   neighbours with palignr $4 and stores 64 aligned bytes.  */
739L(shl_4_bwd):
740 lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9 /* %r9 = loop entry without prefetch.  */
741 cmp %rcx, %rdx
742 movaps -0x04(%rsi), %xmm1 /* Aligned block containing the byte below %rsi.  */
743 jb L(L4_bwd) /* Below the threshold: keep the plain loop.  */
744 lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9 /* Else enter at the prefetching label.  */
745L(L4_bwd):
746 lea -64(%rdx), %rdx /* Bias the count by -64.  */
747 _CET_NOTRACK jmp *%r9
748 ud2
749L(shl_4_bwd_loop_L2):
750 prefetchnta -0x1c0(%rsi)
751L(shl_4_bwd_loop_L1):
752 movaps -0x14(%rsi), %xmm2
753 sub $0x40, %rdx /* CF from here decides the jb further down.  */
754 movaps -0x24(%rsi), %xmm3
755 movaps -0x34(%rsi), %xmm4
756 movaps -0x44(%rsi), %xmm5
757 lea -0x40(%rsi), %rsi
758 palignr $4, %xmm2, %xmm1
759 palignr $4, %xmm3, %xmm2
760 palignr $4, %xmm4, %xmm3
761 palignr $4, %xmm5, %xmm4
762
763 movaps %xmm1, -0x10(%rdi)
764 movaps %xmm5, %xmm1 /* Lowest block carries over to the next round.  */
765
766 movaps %xmm2, -0x20(%rdi)
767 lea -0x40(%rdi), %rdi
768
769 movaps %xmm3, 0x10(%rdi) /* Same slot as -0x30 before the lea.  */
770 jb L(shl_4_bwd_end) /* The subtraction borrowed: last round.  */
771 movaps %xmm4, (%rdi)
772 _CET_NOTRACK jmp *%r9
773 ud2
774L(shl_4_bwd_end):
775 movaps %xmm4, (%rdi)
776 lea 64(%rdx), %rdx /* Remove the bias: %rdx = bytes still to copy.  */
777 movdqu %xmm0, (%r8) /* Store the saved tail block.  */
778 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
779
780 .p2align 4
/* L(shl_5): forward copy for a source 5 bytes past a 16-byte boundary
   (the aligned loads below depend on that) and an aligned destination.
   Expects %r9 = address of L(shl_5) as left by the jump-table dispatch
   (the table is outside this view -- confirm), %rcx = cache threshold,
   %rdx = bytes left, %xmm0/%r8 = saved head block and its address.
   Each round loads four aligned blocks, merges neighbours with
   palignr $5 and stores 64 aligned bytes.  */
781L(shl_5):
782 lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9 /* %r9 = loop entry without prefetch.  */
783 cmp %rcx, %rdx
784 movaps -0x05(%rsi), %xmm1 /* Aligned block containing (%rsi).  */
785 jb L(L5_fwd) /* Below the threshold: keep the plain loop.  */
786 lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9 /* Else enter at the prefetching label.  */
787L(L5_fwd):
788 lea -64(%rdx), %rdx /* Bias the count by -64.  */
789 _CET_NOTRACK jmp *%r9
790 ud2
791L(shl_5_loop_L2):
792 prefetchnta 0x1c0(%rsi)
793L(shl_5_loop_L1):
794 sub $64, %rdx /* CF from here decides the jb further down.  */
795 movaps 0x0b(%rsi), %xmm2
796 movaps 0x1b(%rsi), %xmm3
797 movaps 0x2b(%rsi), %xmm4
798 movaps 0x3b(%rsi), %xmm5
799 movdqa %xmm5, %xmm6 /* Unshifted copy of the last block for the next round.  */
800 palignr $5, %xmm4, %xmm5
801 lea 64(%rsi), %rsi
802 palignr $5, %xmm3, %xmm4
803 palignr $5, %xmm2, %xmm3
804 lea 64(%rdi), %rdi
805 palignr $5, %xmm1, %xmm2
806 movdqa %xmm6, %xmm1
807 movdqa %xmm2, -0x40(%rdi)
808 movaps %xmm3, -0x30(%rdi)
809 jb L(shl_5_end) /* The subtraction borrowed: last round.  */
810 movaps %xmm4, -0x20(%rdi)
811 movaps %xmm5, -0x10(%rdi)
812 _CET_NOTRACK jmp *%r9
813 ud2
814L(shl_5_end):
815 movaps %xmm4, -0x20(%rdi)
816 lea 64(%rdx), %rdx /* Remove the bias: %rdx = bytes still to copy.  */
817 movaps %xmm5, -0x10(%rdi)
818 add %rdx, %rdi
819 movdqu %xmm0, (%r8) /* Store the saved head block.  */
820 add %rdx, %rsi
821 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
822
823 .p2align 4
/* L(shl_5_bwd): backward copy for a source end 5 bytes past a 16-byte
   boundary and an aligned destination end; %rsi/%rdi move downwards.
   Expects %r9 = address of L(shl_5_bwd) as left by the jump-table
   dispatch (table outside this view -- confirm), %rcx = cache
   threshold, %rdx = bytes left, %xmm0/%r8 = saved tail block and its
   address.  Each round loads four aligned blocks below %rsi, merges
   neighbours with palignr $5 and stores 64 aligned bytes.  */
824L(shl_5_bwd):
825 lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9 /* %r9 = loop entry without prefetch.  */
826 cmp %rcx, %rdx
827 movaps -0x05(%rsi), %xmm1 /* Aligned block containing the byte below %rsi.  */
828 jb L(L5_bwd) /* Below the threshold: keep the plain loop.  */
829 lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9 /* Else enter at the prefetching label.  */
830L(L5_bwd):
831 lea -64(%rdx), %rdx /* Bias the count by -64.  */
832 _CET_NOTRACK jmp *%r9
833 ud2
834L(shl_5_bwd_loop_L2):
835 prefetchnta -0x1c0(%rsi)
836L(shl_5_bwd_loop_L1):
837 movaps -0x15(%rsi), %xmm2
838 sub $0x40, %rdx /* CF from here decides the jb further down.  */
839 movaps -0x25(%rsi), %xmm3
840 movaps -0x35(%rsi), %xmm4
841 movaps -0x45(%rsi), %xmm5
842 lea -0x40(%rsi), %rsi
843 palignr $5, %xmm2, %xmm1
844 palignr $5, %xmm3, %xmm2
845 palignr $5, %xmm4, %xmm3
846 palignr $5, %xmm5, %xmm4
847
848 movaps %xmm1, -0x10(%rdi)
849 movaps %xmm5, %xmm1 /* Lowest block carries over to the next round.  */
850
851 movaps %xmm2, -0x20(%rdi)
852 lea -0x40(%rdi), %rdi
853
854 movaps %xmm3, 0x10(%rdi) /* Same slot as -0x30 before the lea.  */
855 jb L(shl_5_bwd_end) /* The subtraction borrowed: last round.  */
856 movaps %xmm4, (%rdi)
857 _CET_NOTRACK jmp *%r9
858 ud2
859L(shl_5_bwd_end):
860 movaps %xmm4, (%rdi)
861 lea 64(%rdx), %rdx /* Remove the bias: %rdx = bytes still to copy.  */
862 movdqu %xmm0, (%r8) /* Store the saved tail block.  */
863 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
864
865 .p2align 4
/* L(shl_6): forward copy for a source 6 bytes past a 16-byte boundary
   (the aligned loads below depend on that) and an aligned destination.
   Expects %r9 = address of L(shl_6) as left by the jump-table dispatch
   (the table is outside this view -- confirm), %rcx = cache threshold,
   %rdx = bytes left, %xmm0/%r8 = saved head block and its address.
   Each round loads four aligned blocks, merges neighbours with
   palignr $6 and stores 64 aligned bytes.  */
866L(shl_6):
867 lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9 /* %r9 = loop entry without prefetch.  */
868 cmp %rcx, %rdx
869 movaps -0x06(%rsi), %xmm1 /* Aligned block containing (%rsi).  */
870 jb L(L6_fwd) /* Below the threshold: keep the plain loop.  */
871 lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9 /* Else enter at the prefetching label.  */
872L(L6_fwd):
873 lea -64(%rdx), %rdx /* Bias the count by -64.  */
874 _CET_NOTRACK jmp *%r9
875 ud2
876L(shl_6_loop_L2):
877 prefetchnta 0x1c0(%rsi)
878L(shl_6_loop_L1):
879 sub $64, %rdx /* CF from here decides the jb further down.  */
880 movaps 0x0a(%rsi), %xmm2
881 movaps 0x1a(%rsi), %xmm3
882 movaps 0x2a(%rsi), %xmm4
883 movaps 0x3a(%rsi), %xmm5
884 movdqa %xmm5, %xmm6 /* Unshifted copy of the last block for the next round.  */
885 palignr $6, %xmm4, %xmm5
886 lea 64(%rsi), %rsi
887 palignr $6, %xmm3, %xmm4
888 palignr $6, %xmm2, %xmm3
889 lea 64(%rdi), %rdi
890 palignr $6, %xmm1, %xmm2
891 movdqa %xmm6, %xmm1
892 movdqa %xmm2, -0x40(%rdi)
893 movaps %xmm3, -0x30(%rdi)
894 jb L(shl_6_end) /* The subtraction borrowed: last round.  */
895 movaps %xmm4, -0x20(%rdi)
896 movaps %xmm5, -0x10(%rdi)
897 _CET_NOTRACK jmp *%r9
898 ud2
899L(shl_6_end):
900 movaps %xmm4, -0x20(%rdi)
901 lea 64(%rdx), %rdx /* Remove the bias: %rdx = bytes still to copy.  */
902 movaps %xmm5, -0x10(%rdi)
903 add %rdx, %rdi
904 movdqu %xmm0, (%r8) /* Store the saved head block.  */
905 add %rdx, %rsi
906 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
907
908 .p2align 4
/* L(shl_6_bwd): backward copy for a source end 6 bytes past a 16-byte
   boundary and an aligned destination end; %rsi/%rdi move downwards.
   Expects %r9 = address of L(shl_6_bwd) as left by the jump-table
   dispatch (table outside this view -- confirm), %rcx = cache
   threshold, %rdx = bytes left, %xmm0/%r8 = saved tail block and its
   address.  Each round loads four aligned blocks below %rsi, merges
   neighbours with palignr $6 and stores 64 aligned bytes.  */
909L(shl_6_bwd):
910 lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9 /* %r9 = loop entry without prefetch.  */
911 cmp %rcx, %rdx
912 movaps -0x06(%rsi), %xmm1 /* Aligned block containing the byte below %rsi.  */
913 jb L(L6_bwd) /* Below the threshold: keep the plain loop.  */
914 lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9 /* Else enter at the prefetching label.  */
915L(L6_bwd):
916 lea -64(%rdx), %rdx /* Bias the count by -64.  */
917 _CET_NOTRACK jmp *%r9
918 ud2
919L(shl_6_bwd_loop_L2):
920 prefetchnta -0x1c0(%rsi)
921L(shl_6_bwd_loop_L1):
922 movaps -0x16(%rsi), %xmm2
923 sub $0x40, %rdx /* CF from here decides the jb further down.  */
924 movaps -0x26(%rsi), %xmm3
925 movaps -0x36(%rsi), %xmm4
926 movaps -0x46(%rsi), %xmm5
927 lea -0x40(%rsi), %rsi
928 palignr $6, %xmm2, %xmm1
929 palignr $6, %xmm3, %xmm2
930 palignr $6, %xmm4, %xmm3
931 palignr $6, %xmm5, %xmm4
932
933 movaps %xmm1, -0x10(%rdi)
934 movaps %xmm5, %xmm1 /* Lowest block carries over to the next round.  */
935
936 movaps %xmm2, -0x20(%rdi)
937 lea -0x40(%rdi), %rdi
938
939 movaps %xmm3, 0x10(%rdi) /* Same slot as -0x30 before the lea.  */
940 jb L(shl_6_bwd_end) /* The subtraction borrowed: last round.  */
941 movaps %xmm4, (%rdi)
942 _CET_NOTRACK jmp *%r9
943 ud2
944L(shl_6_bwd_end):
945 movaps %xmm4, (%rdi)
946 lea 64(%rdx), %rdx /* Remove the bias: %rdx = bytes still to copy.  */
947 movdqu %xmm0, (%r8) /* Store the saved tail block.  */
948 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
949
950 .p2align 4
/* L(shl_7): forward copy for a source 7 bytes past a 16-byte boundary
   (the aligned loads below depend on that) and an aligned destination.
   Expects %r9 = address of L(shl_7) as left by the jump-table dispatch
   (the table is outside this view -- confirm), %rcx = cache threshold,
   %rdx = bytes left, %xmm0/%r8 = saved head block and its address.
   Each round loads four aligned blocks, merges neighbours with
   palignr $7 and stores 64 aligned bytes.  */
951L(shl_7):
952 lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9 /* %r9 = loop entry without prefetch.  */
953 cmp %rcx, %rdx
954 movaps -0x07(%rsi), %xmm1 /* Aligned block containing (%rsi).  */
955 jb L(L7_fwd) /* Below the threshold: keep the plain loop.  */
956 lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9 /* Else enter at the prefetching label.  */
957L(L7_fwd):
958 lea -64(%rdx), %rdx /* Bias the count by -64.  */
959 _CET_NOTRACK jmp *%r9
960 ud2
961L(shl_7_loop_L2):
962 prefetchnta 0x1c0(%rsi)
963L(shl_7_loop_L1):
964 sub $64, %rdx /* CF from here decides the jb further down.  */
965 movaps 0x09(%rsi), %xmm2
966 movaps 0x19(%rsi), %xmm3
967 movaps 0x29(%rsi), %xmm4
968 movaps 0x39(%rsi), %xmm5
969 movdqa %xmm5, %xmm6 /* Unshifted copy of the last block for the next round.  */
970 palignr $7, %xmm4, %xmm5
971 lea 64(%rsi), %rsi
972 palignr $7, %xmm3, %xmm4
973 palignr $7, %xmm2, %xmm3
974 lea 64(%rdi), %rdi
975 palignr $7, %xmm1, %xmm2
976 movdqa %xmm6, %xmm1
977 movdqa %xmm2, -0x40(%rdi)
978 movaps %xmm3, -0x30(%rdi)
979 jb L(shl_7_end) /* The subtraction borrowed: last round.  */
980 movaps %xmm4, -0x20(%rdi)
981 movaps %xmm5, -0x10(%rdi)
982 _CET_NOTRACK jmp *%r9
983 ud2
984L(shl_7_end):
985 movaps %xmm4, -0x20(%rdi)
986 lea 64(%rdx), %rdx /* Remove the bias: %rdx = bytes still to copy.  */
987 movaps %xmm5, -0x10(%rdi)
988 add %rdx, %rdi
989 movdqu %xmm0, (%r8) /* Store the saved head block.  */
990 add %rdx, %rsi
991 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
992
993 .p2align 4
/* L(shl_7_bwd): backward copy for a source end 7 bytes past a 16-byte
   boundary and an aligned destination end; %rsi/%rdi move downwards.
   Expects %r9 = address of L(shl_7_bwd) as left by the jump-table
   dispatch (table outside this view -- confirm), %rcx = cache
   threshold, %rdx = bytes left, %xmm0/%r8 = saved tail block and its
   address.  Each round loads four aligned blocks below %rsi, merges
   neighbours with palignr $7 and stores 64 aligned bytes.  */
994L(shl_7_bwd):
995 lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9 /* %r9 = loop entry without prefetch.  */
996 cmp %rcx, %rdx
997 movaps -0x07(%rsi), %xmm1 /* Aligned block containing the byte below %rsi.  */
998 jb L(L7_bwd) /* Below the threshold: keep the plain loop.  */
999 lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9 /* Else enter at the prefetching label.  */
1000L(L7_bwd):
1001 lea -64(%rdx), %rdx /* Bias the count by -64.  */
1002 _CET_NOTRACK jmp *%r9
1003 ud2
1004L(shl_7_bwd_loop_L2):
1005 prefetchnta -0x1c0(%rsi)
1006L(shl_7_bwd_loop_L1):
1007 movaps -0x17(%rsi), %xmm2
1008 sub $0x40, %rdx /* CF from here decides the jb further down.  */
1009 movaps -0x27(%rsi), %xmm3
1010 movaps -0x37(%rsi), %xmm4
1011 movaps -0x47(%rsi), %xmm5
1012 lea -0x40(%rsi), %rsi
1013 palignr $7, %xmm2, %xmm1
1014 palignr $7, %xmm3, %xmm2
1015 palignr $7, %xmm4, %xmm3
1016 palignr $7, %xmm5, %xmm4
1017
1018 movaps %xmm1, -0x10(%rdi)
1019 movaps %xmm5, %xmm1 /* Lowest block carries over to the next round.  */
1020
1021 movaps %xmm2, -0x20(%rdi)
1022 lea -0x40(%rdi), %rdi
1023
1024 movaps %xmm3, 0x10(%rdi) /* Same slot as -0x30 before the lea.  */
1025 jb L(shl_7_bwd_end) /* The subtraction borrowed: last round.  */
1026 movaps %xmm4, (%rdi)
1027 _CET_NOTRACK jmp *%r9
1028 ud2
1029L(shl_7_bwd_end):
1030 movaps %xmm4, (%rdi)
1031 lea 64(%rdx), %rdx /* Remove the bias: %rdx = bytes still to copy.  */
1032 movdqu %xmm0, (%r8) /* Store the saved tail block.  */
1033 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1034
1035 .p2align 4
/* L(shl_8): forward copy for a source 8 bytes past a 16-byte boundary
   (the aligned loads below depend on that) and an aligned destination.
   Expects %r9 = address of L(shl_8) as left by the jump-table dispatch
   (the table is outside this view -- confirm), %rcx = cache threshold,
   %rdx = bytes left, %xmm0/%r8 = saved head block and its address.
   Each round loads four aligned blocks, merges neighbours with
   palignr $8 and stores 64 aligned bytes.  */
1036L(shl_8):
1037 lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9 /* %r9 = loop entry without prefetch.  */
1038 cmp %rcx, %rdx
1039 movaps -0x08(%rsi), %xmm1 /* Aligned block containing (%rsi).  */
1040 jb L(L8_fwd) /* Below the threshold: keep the plain loop.  */
1041 lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9 /* Else enter at the prefetching label.  */
1042L(L8_fwd):
1043 lea -64(%rdx), %rdx /* Bias the count by -64.  */
1044 _CET_NOTRACK jmp *%r9
/* Follow the indirect jump with ud2 exactly like every other L(shl_N)
   entry sequence in this file; the instruction is never reached by
   normal control flow.  */
 ud2
1045L(shl_8_loop_L2):
1046 prefetchnta 0x1c0(%rsi)
1047L(shl_8_loop_L1):
1048 sub $64, %rdx /* CF from here decides the jb further down.  */
1049 movaps 0x08(%rsi), %xmm2
1050 movaps 0x18(%rsi), %xmm3
1051 movaps 0x28(%rsi), %xmm4
1052 movaps 0x38(%rsi), %xmm5
1053 movdqa %xmm5, %xmm6 /* Unshifted copy of the last block for the next round.  */
1054 palignr $8, %xmm4, %xmm5
1055 lea 64(%rsi), %rsi
1056 palignr $8, %xmm3, %xmm4
1057 palignr $8, %xmm2, %xmm3
1058 lea 64(%rdi), %rdi
1059 palignr $8, %xmm1, %xmm2
1060 movdqa %xmm6, %xmm1
1061 movdqa %xmm2, -0x40(%rdi)
1062 movaps %xmm3, -0x30(%rdi)
1063 jb L(shl_8_end) /* The subtraction borrowed: last round.  */
1064 movaps %xmm4, -0x20(%rdi)
1065 movaps %xmm5, -0x10(%rdi)
1066 _CET_NOTRACK jmp *%r9
1067 ud2
1068 .p2align 4
1069L(shl_8_end):
1070 lea 64(%rdx), %rdx /* Remove the bias: %rdx = bytes still to copy.  */
1071 movaps %xmm4, -0x20(%rdi)
1072 add %rdx, %rsi
1073 movaps %xmm5, -0x10(%rdi)
1074 add %rdx, %rdi
1075 movdqu %xmm0, (%r8) /* Store the saved head block.  */
1076 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1077
1078 .p2align 4
/* L(shl_8_bwd): backward copy for a source end 8 bytes past a 16-byte
   boundary and an aligned destination end; %rsi/%rdi move downwards.
   Expects %r9 = address of L(shl_8_bwd) as left by the jump-table
   dispatch (table outside this view -- confirm), %rcx = cache
   threshold, %rdx = bytes left, %xmm0/%r8 = saved tail block and its
   address.  Each round loads four aligned blocks below %rsi, merges
   neighbours with palignr $8 and stores 64 aligned bytes.  */
1079L(shl_8_bwd):
1080 lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9 /* %r9 = loop entry without prefetch.  */
1081 cmp %rcx, %rdx
1082 movaps -0x08(%rsi), %xmm1 /* Aligned block containing the byte below %rsi.  */
1083 jb L(L8_bwd) /* Below the threshold: keep the plain loop.  */
1084 lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9 /* Else enter at the prefetching label.  */
1085L(L8_bwd):
1086 lea -64(%rdx), %rdx /* Bias the count by -64.  */
1087 _CET_NOTRACK jmp *%r9
1088 ud2
1089L(shl_8_bwd_loop_L2):
1090 prefetchnta -0x1c0(%rsi)
1091L(shl_8_bwd_loop_L1):
1092 movaps -0x18(%rsi), %xmm2
1093 sub $0x40, %rdx /* CF from here decides the jb further down.  */
1094 movaps -0x28(%rsi), %xmm3
1095 movaps -0x38(%rsi), %xmm4
1096 movaps -0x48(%rsi), %xmm5
1097 lea -0x40(%rsi), %rsi
1098 palignr $8, %xmm2, %xmm1
1099 palignr $8, %xmm3, %xmm2
1100 palignr $8, %xmm4, %xmm3
1101 palignr $8, %xmm5, %xmm4
1102
1103 movaps %xmm1, -0x10(%rdi)
1104 movaps %xmm5, %xmm1 /* Lowest block carries over to the next round.  */
1105
1106 movaps %xmm2, -0x20(%rdi)
1107 lea -0x40(%rdi), %rdi
1108
1109 movaps %xmm3, 0x10(%rdi) /* Same slot as -0x30 before the lea.  */
1110 jb L(shl_8_bwd_end) /* The subtraction borrowed: last round.  */
1111 movaps %xmm4, (%rdi)
1112 _CET_NOTRACK jmp *%r9
1113 ud2
1114L(shl_8_bwd_end):
1115 movaps %xmm4, (%rdi)
1116 lea 64(%rdx), %rdx /* Remove the bias: %rdx = bytes still to copy.  */
1117 movdqu %xmm0, (%r8) /* Store the saved tail block.  */
1118 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1119
1120 .p2align 4
/* L(shl_9): forward copy for a source 9 bytes past a 16-byte boundary
   (the aligned loads below depend on that) and an aligned destination.
   Expects %r9 = address of L(shl_9) as left by the jump-table dispatch
   (the table is outside this view -- confirm), %rcx = cache threshold,
   %rdx = bytes left, %xmm0/%r8 = saved head block and its address.
   Each round loads four aligned blocks, merges neighbours with
   palignr $9 and stores 64 aligned bytes.  */
1121L(shl_9):
1122 lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9 /* %r9 = loop entry without prefetch.  */
1123 cmp %rcx, %rdx
1124 movaps -0x09(%rsi), %xmm1 /* Aligned block containing (%rsi).  */
1125 jb L(L9_fwd) /* Below the threshold: keep the plain loop.  */
1126 lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9 /* Else enter at the prefetching label.  */
1127L(L9_fwd):
1128 lea -64(%rdx), %rdx /* Bias the count by -64.  */
1129 _CET_NOTRACK jmp *%r9
1130 ud2
1131L(shl_9_loop_L2):
1132 prefetchnta 0x1c0(%rsi)
1133L(shl_9_loop_L1):
1134 sub $64, %rdx /* CF from here decides the jb further down.  */
1135 movaps 0x07(%rsi), %xmm2
1136 movaps 0x17(%rsi), %xmm3
1137 movaps 0x27(%rsi), %xmm4
1138 movaps 0x37(%rsi), %xmm5
1139 movdqa %xmm5, %xmm6 /* Unshifted copy of the last block for the next round.  */
1140 palignr $9, %xmm4, %xmm5
1141 lea 64(%rsi), %rsi
1142 palignr $9, %xmm3, %xmm4
1143 palignr $9, %xmm2, %xmm3
1144 lea 64(%rdi), %rdi
1145 palignr $9, %xmm1, %xmm2
1146 movdqa %xmm6, %xmm1
1147 movdqa %xmm2, -0x40(%rdi)
1148 movaps %xmm3, -0x30(%rdi)
1149 jb L(shl_9_end) /* The subtraction borrowed: last round.  */
1150 movaps %xmm4, -0x20(%rdi)
1151 movaps %xmm5, -0x10(%rdi)
1152 _CET_NOTRACK jmp *%r9
1153 ud2
1154L(shl_9_end):
1155 movaps %xmm4, -0x20(%rdi)
1156 lea 64(%rdx), %rdx /* Remove the bias: %rdx = bytes still to copy.  */
1157 movaps %xmm5, -0x10(%rdi)
1158 add %rdx, %rdi
1159 movdqu %xmm0, (%r8) /* Store the saved head block.  */
1160 add %rdx, %rsi
1161 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1162
1163 .p2align 4
/* L(shl_9_bwd): backward copy for a source end 9 bytes past a 16-byte
   boundary and an aligned destination end; %rsi/%rdi move downwards.
   Expects %r9 = address of L(shl_9_bwd) as left by the jump-table
   dispatch (table outside this view -- confirm), %rcx = cache
   threshold, %rdx = bytes left, %xmm0/%r8 = saved tail block and its
   address.  Each round loads four aligned blocks below %rsi, merges
   neighbours with palignr $9 and stores 64 aligned bytes.  */
1164L(shl_9_bwd):
1165 lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9 /* %r9 = loop entry without prefetch.  */
1166 cmp %rcx, %rdx
1167 movaps -0x09(%rsi), %xmm1 /* Aligned block containing the byte below %rsi.  */
1168 jb L(L9_bwd) /* Below the threshold: keep the plain loop.  */
1169 lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9 /* Else enter at the prefetching label.  */
1170L(L9_bwd):
1171 lea -64(%rdx), %rdx /* Bias the count by -64.  */
1172 _CET_NOTRACK jmp *%r9
1173 ud2
1174L(shl_9_bwd_loop_L2):
1175 prefetchnta -0x1c0(%rsi)
1176L(shl_9_bwd_loop_L1):
1177 movaps -0x19(%rsi), %xmm2
1178 sub $0x40, %rdx /* CF from here decides the jb further down.  */
1179 movaps -0x29(%rsi), %xmm3
1180 movaps -0x39(%rsi), %xmm4
1181 movaps -0x49(%rsi), %xmm5
1182 lea -0x40(%rsi), %rsi
1183 palignr $9, %xmm2, %xmm1
1184 palignr $9, %xmm3, %xmm2
1185 palignr $9, %xmm4, %xmm3
1186 palignr $9, %xmm5, %xmm4
1187
1188 movaps %xmm1, -0x10(%rdi)
1189 movaps %xmm5, %xmm1 /* Lowest block carries over to the next round.  */
1190
1191 movaps %xmm2, -0x20(%rdi)
1192 lea -0x40(%rdi), %rdi
1193
1194 movaps %xmm3, 0x10(%rdi) /* Same slot as -0x30 before the lea.  */
1195 jb L(shl_9_bwd_end) /* The subtraction borrowed: last round.  */
1196 movaps %xmm4, (%rdi)
1197 _CET_NOTRACK jmp *%r9
1198 ud2
1199L(shl_9_bwd_end):
1200 movaps %xmm4, (%rdi)
1201 lea 64(%rdx), %rdx /* Remove the bias: %rdx = bytes still to copy.  */
1202 movdqu %xmm0, (%r8) /* Store the saved tail block.  */
1203 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1204
1205 .p2align 4
/* L(shl_10): forward copy for a source 10 bytes past a 16-byte boundary
   (the aligned loads below depend on that) and an aligned destination.
   Expects %r9 = address of L(shl_10) as left by the jump-table dispatch
   (the table is outside this view -- confirm), %rcx = cache threshold,
   %rdx = bytes left, %xmm0/%r8 = saved head block and its address.
   Each round loads four aligned blocks, merges neighbours with
   palignr $10 and stores 64 aligned bytes.  */
1206L(shl_10):
1207 lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9 /* %r9 = loop entry without prefetch.  */
1208 cmp %rcx, %rdx
1209 movaps -0x0a(%rsi), %xmm1 /* Aligned block containing (%rsi).  */
1210 jb L(L10_fwd) /* Below the threshold: keep the plain loop.  */
1211 lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9 /* Else enter at the prefetching label.  */
1212L(L10_fwd):
1213 lea -64(%rdx), %rdx /* Bias the count by -64.  */
1214 _CET_NOTRACK jmp *%r9
1215 ud2
1216L(shl_10_loop_L2):
1217 prefetchnta 0x1c0(%rsi)
1218L(shl_10_loop_L1):
1219 sub $64, %rdx /* CF from here decides the jb further down.  */
1220 movaps 0x06(%rsi), %xmm2
1221 movaps 0x16(%rsi), %xmm3
1222 movaps 0x26(%rsi), %xmm4
1223 movaps 0x36(%rsi), %xmm5
1224 movdqa %xmm5, %xmm6 /* Unshifted copy of the last block for the next round.  */
1225 palignr $10, %xmm4, %xmm5
1226 lea 64(%rsi), %rsi
1227 palignr $10, %xmm3, %xmm4
1228 palignr $10, %xmm2, %xmm3
1229 lea 64(%rdi), %rdi
1230 palignr $10, %xmm1, %xmm2
1231 movdqa %xmm6, %xmm1
1232 movdqa %xmm2, -0x40(%rdi)
1233 movaps %xmm3, -0x30(%rdi)
1234 jb L(shl_10_end) /* The subtraction borrowed: last round.  */
1235 movaps %xmm4, -0x20(%rdi)
1236 movaps %xmm5, -0x10(%rdi)
1237 _CET_NOTRACK jmp *%r9
1238 ud2
1239L(shl_10_end):
1240 movaps %xmm4, -0x20(%rdi)
1241 lea 64(%rdx), %rdx /* Remove the bias: %rdx = bytes still to copy.  */
1242 movaps %xmm5, -0x10(%rdi)
1243 add %rdx, %rdi
1244 movdqu %xmm0, (%r8) /* Store the saved head block.  */
1245 add %rdx, %rsi
1246 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1247
1248 .p2align 4
/* L(shl_10_bwd): backward copy for a source end 10 bytes past a 16-byte
   boundary and an aligned destination end; %rsi/%rdi move downwards.
   Expects %r9 = address of L(shl_10_bwd) as left by the jump-table
   dispatch (table outside this view -- confirm), %rcx = cache
   threshold, %rdx = bytes left, %xmm0/%r8 = saved tail block and its
   address.  Each round loads four aligned blocks below %rsi, merges
   neighbours with palignr $10 and stores 64 aligned bytes.  */
1249L(shl_10_bwd):
1250 lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9 /* %r9 = loop entry without prefetch.  */
1251 cmp %rcx, %rdx
1252 movaps -0x0a(%rsi), %xmm1 /* Aligned block containing the byte below %rsi.  */
1253 jb L(L10_bwd) /* Below the threshold: keep the plain loop.  */
1254 lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9 /* Else enter at the prefetching label.  */
1255L(L10_bwd):
1256 lea -64(%rdx), %rdx /* Bias the count by -64.  */
1257 _CET_NOTRACK jmp *%r9
1258 ud2
1259L(shl_10_bwd_loop_L2):
1260 prefetchnta -0x1c0(%rsi)
1261L(shl_10_bwd_loop_L1):
1262 movaps -0x1a(%rsi), %xmm2
1263 sub $0x40, %rdx /* CF from here decides the jb further down.  */
1264 movaps -0x2a(%rsi), %xmm3
1265 movaps -0x3a(%rsi), %xmm4
1266 movaps -0x4a(%rsi), %xmm5
1267 lea -0x40(%rsi), %rsi
1268 palignr $10, %xmm2, %xmm1
1269 palignr $10, %xmm3, %xmm2
1270 palignr $10, %xmm4, %xmm3
1271 palignr $10, %xmm5, %xmm4
1272
1273 movaps %xmm1, -0x10(%rdi)
1274 movaps %xmm5, %xmm1 /* Lowest block carries over to the next round.  */
1275
1276 movaps %xmm2, -0x20(%rdi)
1277 lea -0x40(%rdi), %rdi
1278
1279 movaps %xmm3, 0x10(%rdi) /* Same slot as -0x30 before the lea.  */
1280 jb L(shl_10_bwd_end) /* The subtraction borrowed: last round.  */
1281 movaps %xmm4, (%rdi)
1282 _CET_NOTRACK jmp *%r9
1283 ud2
1284L(shl_10_bwd_end):
1285 movaps %xmm4, (%rdi)
1286 lea 64(%rdx), %rdx /* Remove the bias: %rdx = bytes still to copy.  */
1287 movdqu %xmm0, (%r8) /* Store the saved tail block.  */
1288 BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1289
/* Forward 64-byte loop for a source lying 11 bytes past a 16-byte
   boundary (the aligned loads at %rsi - 11 would fault otherwise).
   Four aligned source chunks are merged with their neighbours by
   palignr $11 and written through %rdi with aligned stores.
   NOTE(review): %r9 presumably holds the address of L(shl_11) on entry,
   and %xmm0/%r8 a 16-byte block saved by the dispatch code -- confirm
   against the caller.  */
	.p2align 4
L(shl_11):
	/* %xmm1 = aligned chunk just below the first bytes to emit.  */
	movaps	-11(%rsi), %xmm1
	leaq	(L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
	cmpq	%rcx, %rdx
	jb	L(L11_fwd)
	/* %rdx is not below %rcx: enter through the prefetching label.  */
	leaq	(L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
L(L11_fwd):
	leaq	-0x40(%rdx), %rdx
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_11_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_11_loop_L1):
	/* The borrow from this subtraction is consumed by the jb below;
	   the SSE moves, palignr and lea in between leave flags alone.  */
	subq	$0x40, %rdx
	movaps	5(%rsi), %xmm2
	movaps	21(%rsi), %xmm3
	movaps	37(%rsi), %xmm4
	movaps	53(%rsi), %xmm5
	leaq	0x40(%rsi), %rsi
	/* Keep the unshifted top chunk; it seeds %xmm1 for the next pass.  */
	movdqa	%xmm5, %xmm6
	palignr	$11, %xmm4, %xmm5
	palignr	$11, %xmm3, %xmm4
	palignr	$11, %xmm2, %xmm3
	palignr	$11, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1
	leaq	0x40(%rdi), %rdi
	movdqa	%xmm2, -64(%rdi)
	movaps	%xmm3, -48(%rdi)
	jb	L(shl_11_end)
	movaps	%xmm4, -32(%rdi)
	movaps	%xmm5, -16(%rdi)
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_11_end):
	/* Complete this 64-byte group, undo the count bias, advance both
	   pointers by the residue and dispatch on it.  */
	movaps	%xmm4, -32(%rdi)
	movaps	%xmm5, -16(%rdi)
	leaq	0x40(%rdx), %rdx
	addq	%rdx, %rdi
	movdqu	%xmm0, (%r8)
	addq	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1332
/* Backward 64-byte loop for a source lying 11 bytes past a 16-byte
   boundary.  Each pass reads the four aligned chunks below %rsi,
   merges neighbours with palignr $11 and stores 64 bytes below %rdi.
   NOTE(review): %r9 presumably holds the address of L(shl_11_bwd) on
   entry, and %xmm0/%r8 a block saved by the dispatch code -- confirm.  */
	.p2align 4
L(shl_11_bwd):
	movaps	-11(%rsi), %xmm1
	leaq	(L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
	cmpq	%rcx, %rdx
	jb	L(L11_bwd)
	/* %rdx is not below %rcx: enter through the prefetching label.  */
	leaq	(L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
L(L11_bwd):
	leaq	-0x40(%rdx), %rdx
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_11_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_11_bwd_loop_L1):
	/* Borrow is tested by the jb below; nothing in between writes flags.  */
	subq	$64, %rdx
	movaps	-27(%rsi), %xmm2
	movaps	-43(%rsi), %xmm3
	movaps	-59(%rsi), %xmm4
	movaps	-75(%rsi), %xmm5
	leaq	-64(%rsi), %rsi
	palignr	$11, %xmm2, %xmm1
	palignr	$11, %xmm3, %xmm2
	palignr	$11, %xmm4, %xmm3
	palignr	$11, %xmm5, %xmm4
	movaps	%xmm1, -16(%rdi)
	movaps	%xmm2, -32(%rdi)
	movaps	%xmm3, -48(%rdi)
	/* The lowest chunk becomes the high operand of the next pass.  */
	movaps	%xmm5, %xmm1
	leaq	-64(%rdi), %rdi
	jb	L(shl_11_bwd_end)
	movaps	%xmm4, (%rdi)
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_11_bwd_end):
	/* Finish the group and undo the count bias; %rsi and %rdi are
	   handed to the tail table unchanged.  */
	movaps	%xmm4, (%rdi)
	leaq	0x40(%rdx), %rdx
	movdqu	%xmm0, (%r8)
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1374
/* Forward 64-byte loop for a source lying 12 bytes past a 16-byte
   boundary (the aligned loads at %rsi - 12 would fault otherwise).
   Four aligned source chunks are merged with their neighbours by
   palignr $12 and written through %rdi with aligned stores.
   NOTE(review): %r9 presumably holds the address of L(shl_12) on entry,
   and %xmm0/%r8 a 16-byte block saved by the dispatch code -- confirm
   against the caller.  */
	.p2align 4
L(shl_12):
	/* %xmm1 = aligned chunk just below the first bytes to emit.  */
	movaps	-12(%rsi), %xmm1
	leaq	(L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
	cmpq	%rcx, %rdx
	jb	L(L12_fwd)
	/* %rdx is not below %rcx: enter through the prefetching label.  */
	leaq	(L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
L(L12_fwd):
	leaq	-0x40(%rdx), %rdx
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_12_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_12_loop_L1):
	/* The borrow from this subtraction is consumed by the jb below;
	   the SSE moves, palignr and lea in between leave flags alone.  */
	subq	$0x40, %rdx
	movaps	4(%rsi), %xmm2
	movaps	20(%rsi), %xmm3
	movaps	36(%rsi), %xmm4
	movaps	52(%rsi), %xmm5
	leaq	0x40(%rsi), %rsi
	/* Keep the unshifted top chunk; it seeds %xmm1 for the next pass.  */
	movdqa	%xmm5, %xmm6
	palignr	$12, %xmm4, %xmm5
	palignr	$12, %xmm3, %xmm4
	palignr	$12, %xmm2, %xmm3
	palignr	$12, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1
	leaq	0x40(%rdi), %rdi
	movdqa	%xmm2, -64(%rdi)
	movaps	%xmm3, -48(%rdi)
	jb	L(shl_12_end)
	movaps	%xmm4, -32(%rdi)
	movaps	%xmm5, -16(%rdi)
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_12_end):
	/* Complete this 64-byte group, undo the count bias, advance both
	   pointers by the residue and dispatch on it.  */
	movaps	%xmm4, -32(%rdi)
	movaps	%xmm5, -16(%rdi)
	leaq	0x40(%rdx), %rdx
	addq	%rdx, %rdi
	movdqu	%xmm0, (%r8)
	addq	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1417
/* Backward 64-byte loop for a source lying 12 bytes past a 16-byte
   boundary.  Each pass reads the four aligned chunks below %rsi,
   merges neighbours with palignr $12 and stores 64 bytes below %rdi.
   NOTE(review): %r9 presumably holds the address of L(shl_12_bwd) on
   entry, and %xmm0/%r8 a block saved by the dispatch code -- confirm.  */
	.p2align 4
L(shl_12_bwd):
	movaps	-12(%rsi), %xmm1
	leaq	(L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
	cmpq	%rcx, %rdx
	jb	L(L12_bwd)
	/* %rdx is not below %rcx: enter through the prefetching label.  */
	leaq	(L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
L(L12_bwd):
	leaq	-0x40(%rdx), %rdx
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_12_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_12_bwd_loop_L1):
	/* Borrow is tested by the jb below; nothing in between writes flags.  */
	subq	$64, %rdx
	movaps	-28(%rsi), %xmm2
	movaps	-44(%rsi), %xmm3
	movaps	-60(%rsi), %xmm4
	movaps	-76(%rsi), %xmm5
	leaq	-64(%rsi), %rsi
	palignr	$12, %xmm2, %xmm1
	palignr	$12, %xmm3, %xmm2
	palignr	$12, %xmm4, %xmm3
	palignr	$12, %xmm5, %xmm4
	movaps	%xmm1, -16(%rdi)
	movaps	%xmm2, -32(%rdi)
	movaps	%xmm3, -48(%rdi)
	/* The lowest chunk becomes the high operand of the next pass.  */
	movaps	%xmm5, %xmm1
	leaq	-64(%rdi), %rdi
	jb	L(shl_12_bwd_end)
	movaps	%xmm4, (%rdi)
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_12_bwd_end):
	/* Finish the group and undo the count bias; %rsi and %rdi are
	   handed to the tail table unchanged.  */
	movaps	%xmm4, (%rdi)
	leaq	0x40(%rdx), %rdx
	movdqu	%xmm0, (%r8)
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1459
/* Forward 64-byte loop for a source lying 13 bytes past a 16-byte
   boundary (the aligned loads at %rsi - 13 would fault otherwise).
   Four aligned source chunks are merged with their neighbours by
   palignr $13 and written through %rdi with aligned stores.
   NOTE(review): %r9 presumably holds the address of L(shl_13) on entry,
   and %xmm0/%r8 a 16-byte block saved by the dispatch code -- confirm
   against the caller.  */
	.p2align 4
L(shl_13):
	/* %xmm1 = aligned chunk just below the first bytes to emit.  */
	movaps	-13(%rsi), %xmm1
	leaq	(L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
	cmpq	%rcx, %rdx
	jb	L(L13_fwd)
	/* %rdx is not below %rcx: enter through the prefetching label.  */
	leaq	(L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
L(L13_fwd):
	leaq	-0x40(%rdx), %rdx
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_13_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_13_loop_L1):
	/* The borrow from this subtraction is consumed by the jb below;
	   the SSE moves, palignr and lea in between leave flags alone.  */
	subq	$0x40, %rdx
	movaps	3(%rsi), %xmm2
	movaps	19(%rsi), %xmm3
	movaps	35(%rsi), %xmm4
	movaps	51(%rsi), %xmm5
	leaq	0x40(%rsi), %rsi
	/* Keep the unshifted top chunk; it seeds %xmm1 for the next pass.  */
	movdqa	%xmm5, %xmm6
	palignr	$13, %xmm4, %xmm5
	palignr	$13, %xmm3, %xmm4
	palignr	$13, %xmm2, %xmm3
	palignr	$13, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1
	leaq	0x40(%rdi), %rdi
	movdqa	%xmm2, -64(%rdi)
	movaps	%xmm3, -48(%rdi)
	jb	L(shl_13_end)
	movaps	%xmm4, -32(%rdi)
	movaps	%xmm5, -16(%rdi)
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_13_end):
	/* Complete this 64-byte group, undo the count bias, advance both
	   pointers by the residue and dispatch on it.  */
	movaps	%xmm4, -32(%rdi)
	movaps	%xmm5, -16(%rdi)
	leaq	0x40(%rdx), %rdx
	addq	%rdx, %rdi
	movdqu	%xmm0, (%r8)
	addq	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1502
/* Backward 64-byte loop for a source lying 13 bytes past a 16-byte
   boundary.  Each pass reads the four aligned chunks below %rsi,
   merges neighbours with palignr $13 and stores 64 bytes below %rdi.
   NOTE(review): %r9 presumably holds the address of L(shl_13_bwd) on
   entry, and %xmm0/%r8 a block saved by the dispatch code -- confirm.  */
	.p2align 4
L(shl_13_bwd):
	movaps	-13(%rsi), %xmm1
	leaq	(L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
	cmpq	%rcx, %rdx
	jb	L(L13_bwd)
	/* %rdx is not below %rcx: enter through the prefetching label.  */
	leaq	(L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
L(L13_bwd):
	leaq	-0x40(%rdx), %rdx
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_13_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_13_bwd_loop_L1):
	/* Borrow is tested by the jb below; nothing in between writes flags.  */
	subq	$64, %rdx
	movaps	-29(%rsi), %xmm2
	movaps	-45(%rsi), %xmm3
	movaps	-61(%rsi), %xmm4
	movaps	-77(%rsi), %xmm5
	leaq	-64(%rsi), %rsi
	palignr	$13, %xmm2, %xmm1
	palignr	$13, %xmm3, %xmm2
	palignr	$13, %xmm4, %xmm3
	palignr	$13, %xmm5, %xmm4
	movaps	%xmm1, -16(%rdi)
	movaps	%xmm2, -32(%rdi)
	movaps	%xmm3, -48(%rdi)
	/* The lowest chunk becomes the high operand of the next pass.  */
	movaps	%xmm5, %xmm1
	leaq	-64(%rdi), %rdi
	jb	L(shl_13_bwd_end)
	movaps	%xmm4, (%rdi)
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_13_bwd_end):
	/* Finish the group and undo the count bias; %rsi and %rdi are
	   handed to the tail table unchanged.  */
	movaps	%xmm4, (%rdi)
	leaq	0x40(%rdx), %rdx
	movdqu	%xmm0, (%r8)
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1544
/* Forward 64-byte loop for a source lying 14 bytes past a 16-byte
   boundary (the aligned loads at %rsi - 14 would fault otherwise).
   Four aligned source chunks are merged with their neighbours by
   palignr $14 and written through %rdi with aligned stores.
   NOTE(review): %r9 presumably holds the address of L(shl_14) on entry,
   and %xmm0/%r8 a 16-byte block saved by the dispatch code -- confirm
   against the caller.  */
	.p2align 4
L(shl_14):
	/* %xmm1 = aligned chunk just below the first bytes to emit.  */
	movaps	-14(%rsi), %xmm1
	leaq	(L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
	cmpq	%rcx, %rdx
	jb	L(L14_fwd)
	/* %rdx is not below %rcx: enter through the prefetching label.  */
	leaq	(L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
L(L14_fwd):
	leaq	-0x40(%rdx), %rdx
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_14_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_14_loop_L1):
	/* The borrow from this subtraction is consumed by the jb below;
	   the SSE moves, palignr and lea in between leave flags alone.  */
	subq	$0x40, %rdx
	movaps	2(%rsi), %xmm2
	movaps	18(%rsi), %xmm3
	movaps	34(%rsi), %xmm4
	movaps	50(%rsi), %xmm5
	leaq	0x40(%rsi), %rsi
	/* Keep the unshifted top chunk; it seeds %xmm1 for the next pass.  */
	movdqa	%xmm5, %xmm6
	palignr	$14, %xmm4, %xmm5
	palignr	$14, %xmm3, %xmm4
	palignr	$14, %xmm2, %xmm3
	palignr	$14, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1
	leaq	0x40(%rdi), %rdi
	movdqa	%xmm2, -64(%rdi)
	movaps	%xmm3, -48(%rdi)
	jb	L(shl_14_end)
	movaps	%xmm4, -32(%rdi)
	movaps	%xmm5, -16(%rdi)
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_14_end):
	/* Complete this 64-byte group, undo the count bias, advance both
	   pointers by the residue and dispatch on it.  */
	movaps	%xmm4, -32(%rdi)
	movaps	%xmm5, -16(%rdi)
	leaq	0x40(%rdx), %rdx
	addq	%rdx, %rdi
	movdqu	%xmm0, (%r8)
	addq	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1587
/* Backward 64-byte loop for a source lying 14 bytes past a 16-byte
   boundary.  Each pass reads the four aligned chunks below %rsi,
   merges neighbours with palignr $14 and stores 64 bytes below %rdi.
   NOTE(review): %r9 presumably holds the address of L(shl_14_bwd) on
   entry, and %xmm0/%r8 a block saved by the dispatch code -- confirm.  */
	.p2align 4
L(shl_14_bwd):
	movaps	-14(%rsi), %xmm1
	leaq	(L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
	cmpq	%rcx, %rdx
	jb	L(L14_bwd)
	/* %rdx is not below %rcx: enter through the prefetching label.  */
	leaq	(L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
L(L14_bwd):
	leaq	-0x40(%rdx), %rdx
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_14_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_14_bwd_loop_L1):
	/* Borrow is tested by the jb below; nothing in between writes flags.  */
	subq	$64, %rdx
	movaps	-30(%rsi), %xmm2
	movaps	-46(%rsi), %xmm3
	movaps	-62(%rsi), %xmm4
	movaps	-78(%rsi), %xmm5
	leaq	-64(%rsi), %rsi
	palignr	$14, %xmm2, %xmm1
	palignr	$14, %xmm3, %xmm2
	palignr	$14, %xmm4, %xmm3
	palignr	$14, %xmm5, %xmm4
	movaps	%xmm1, -16(%rdi)
	movaps	%xmm2, -32(%rdi)
	movaps	%xmm3, -48(%rdi)
	/* The lowest chunk becomes the high operand of the next pass.  */
	movaps	%xmm5, %xmm1
	leaq	-64(%rdi), %rdi
	jb	L(shl_14_bwd_end)
	movaps	%xmm4, (%rdi)
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_14_bwd_end):
	/* Finish the group and undo the count bias; %rsi and %rdi are
	   handed to the tail table unchanged.  */
	movaps	%xmm4, (%rdi)
	leaq	0x40(%rdx), %rdx
	movdqu	%xmm0, (%r8)
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1629
/* Forward 64-byte loop for a source lying 15 bytes past a 16-byte
   boundary (the aligned loads at %rsi - 15 would fault otherwise).
   Four aligned source chunks are merged with their neighbours by
   palignr $15 and written through %rdi with aligned stores.
   NOTE(review): %r9 presumably holds the address of L(shl_15) on entry,
   and %xmm0/%r8 a 16-byte block saved by the dispatch code -- confirm
   against the caller.  */
	.p2align 4
L(shl_15):
	/* %xmm1 = aligned chunk just below the first bytes to emit.  */
	movaps	-15(%rsi), %xmm1
	leaq	(L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
	cmpq	%rcx, %rdx
	jb	L(L15_fwd)
	/* %rdx is not below %rcx: enter through the prefetching label.  */
	leaq	(L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
L(L15_fwd):
	leaq	-0x40(%rdx), %rdx
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_15_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_15_loop_L1):
	/* The borrow from this subtraction is consumed by the jb below;
	   the SSE moves, palignr and lea in between leave flags alone.  */
	subq	$0x40, %rdx
	movaps	1(%rsi), %xmm2
	movaps	17(%rsi), %xmm3
	movaps	33(%rsi), %xmm4
	movaps	49(%rsi), %xmm5
	leaq	0x40(%rsi), %rsi
	/* Keep the unshifted top chunk; it seeds %xmm1 for the next pass.  */
	movdqa	%xmm5, %xmm6
	palignr	$15, %xmm4, %xmm5
	palignr	$15, %xmm3, %xmm4
	palignr	$15, %xmm2, %xmm3
	palignr	$15, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1
	leaq	0x40(%rdi), %rdi
	movdqa	%xmm2, -64(%rdi)
	movaps	%xmm3, -48(%rdi)
	jb	L(shl_15_end)
	movaps	%xmm4, -32(%rdi)
	movaps	%xmm5, -16(%rdi)
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_15_end):
	/* Complete this 64-byte group, undo the count bias, advance both
	   pointers by the residue and dispatch on it.  */
	movaps	%xmm4, -32(%rdi)
	movaps	%xmm5, -16(%rdi)
	leaq	0x40(%rdx), %rdx
	addq	%rdx, %rdi
	movdqu	%xmm0, (%r8)
	addq	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1672
/* Backward 64-byte loop for a source lying 15 bytes past a 16-byte
   boundary.  Each pass reads the four aligned chunks below %rsi,
   merges neighbours with palignr $15 and stores 64 bytes below %rdi.
   NOTE(review): %r9 presumably holds the address of L(shl_15_bwd) on
   entry, and %xmm0/%r8 a block saved by the dispatch code -- confirm.  */
	.p2align 4
L(shl_15_bwd):
	movaps	-15(%rsi), %xmm1
	leaq	(L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
	cmpq	%rcx, %rdx
	jb	L(L15_bwd)
	/* %rdx is not below %rcx: enter through the prefetching label.  */
	leaq	(L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
L(L15_bwd):
	leaq	-0x40(%rdx), %rdx
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_15_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_15_bwd_loop_L1):
	/* Borrow is tested by the jb below; nothing in between writes flags.  */
	subq	$64, %rdx
	movaps	-31(%rsi), %xmm2
	movaps	-47(%rsi), %xmm3
	movaps	-63(%rsi), %xmm4
	movaps	-79(%rsi), %xmm5
	leaq	-64(%rsi), %rsi
	palignr	$15, %xmm2, %xmm1
	palignr	$15, %xmm3, %xmm2
	palignr	$15, %xmm4, %xmm3
	palignr	$15, %xmm5, %xmm4
	movaps	%xmm1, -16(%rdi)
	movaps	%xmm2, -32(%rdi)
	movaps	%xmm3, -48(%rdi)
	/* The lowest chunk becomes the high operand of the next pass.  */
	movaps	%xmm5, %xmm1
	leaq	-64(%rdi), %rdi
	jb	L(shl_15_bwd_end)
	movaps	%xmm4, (%rdi)
	_CET_NOTRACK jmp *%r9
	ud2
L(shl_15_bwd_end):
	/* Finish the group and undo the count bias; %rsi and %rdi are
	   handed to the tail table unchanged.  */
	movaps	%xmm4, (%rdi)
	leaq	0x40(%rdx), %rdx
	movdqu	%xmm0, (%r8)
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1714
/* Move the 72 bytes ending at %rsi to the 72 bytes ending at %rdi:
   two 16-byte and five 8-byte moves, every load before the first store.  */
	.p2align 4
L(write_72bytes):
	movdqu	-72(%rsi), %xmm0
	movdqu	-56(%rsi), %xmm1
	movq	-40(%rsi), %r8
	movq	-32(%rsi), %r9
	movq	-24(%rsi), %r10
	movq	-16(%rsi), %r11
	movq	-8(%rsi), %rcx
	movdqu	%xmm0, -72(%rdi)
	movdqu	%xmm1, -56(%rdi)
	movq	%r8, -40(%rdi)
	movq	%r9, -32(%rdi)
	movq	%r10, -24(%rdi)
	movq	%r11, -16(%rdi)
	movq	%rcx, -8(%rdi)
	ret
1732
/* Move the 64 bytes ending at %rsi to the 64 bytes ending at %rdi:
   one 16-byte and six 8-byte moves, every load before the first store.  */
	.p2align 4
L(write_64bytes):
	movdqu	-64(%rsi), %xmm0
	movq	-48(%rsi), %rcx
	movq	-40(%rsi), %r8
	movq	-32(%rsi), %r9
	movq	-24(%rsi), %r10
	movq	-16(%rsi), %r11
	movq	-8(%rsi), %rdx
	movdqu	%xmm0, -64(%rdi)
	movq	%rcx, -48(%rdi)
	movq	%r8, -40(%rdi)
	movq	%r9, -32(%rdi)
	movq	%r10, -24(%rdi)
	movq	%r11, -16(%rdi)
	movq	%rdx, -8(%rdi)
	ret
1750
/* Move the 56 bytes ending at %rsi to the 56 bytes ending at %rdi:
   one 16-byte and five 8-byte moves, every load before the first store.  */
	.p2align 4
L(write_56bytes):
	movdqu	-56(%rsi), %xmm0
	movq	-40(%rsi), %r8
	movq	-32(%rsi), %r9
	movq	-24(%rsi), %r10
	movq	-16(%rsi), %r11
	movq	-8(%rsi), %rcx
	movdqu	%xmm0, -56(%rdi)
	movq	%r8, -40(%rdi)
	movq	%r9, -32(%rdi)
	movq	%r10, -24(%rdi)
	movq	%r11, -16(%rdi)
	movq	%rcx, -8(%rdi)
	ret
1766
/* Move the 48 bytes ending at %rsi to the 48 bytes ending at %rdi as
   six quadwords, every load before the first store.  */
	.p2align 4
L(write_48bytes):
	movq	-48(%rsi), %rcx
	movq	-40(%rsi), %r8
	movq	-32(%rsi), %r9
	movq	-24(%rsi), %r10
	movq	-16(%rsi), %r11
	movq	-8(%rsi), %rdx
	movq	%rcx, -48(%rdi)
	movq	%r8, -40(%rdi)
	movq	%r9, -32(%rdi)
	movq	%r10, -24(%rdi)
	movq	%r11, -16(%rdi)
	movq	%rdx, -8(%rdi)
	ret
1782
/* Move the 40 bytes ending at %rsi to the 40 bytes ending at %rdi as
   five quadwords, every load before the first store.  */
	.p2align 4
L(write_40bytes):
	movq	-40(%rsi), %r8
	movq	-32(%rsi), %r9
	movq	-24(%rsi), %r10
	movq	-16(%rsi), %r11
	movq	-8(%rsi), %rdx
	movq	%r8, -40(%rdi)
	movq	%r9, -32(%rdi)
	movq	%r10, -24(%rdi)
	movq	%r11, -16(%rdi)
	movq	%rdx, -8(%rdi)
	ret
1796
/* Move the 32 bytes ending at %rsi to the 32 bytes ending at %rdi as
   four quadwords, every load before the first store.  */
	.p2align 4
L(write_32bytes):
	movq	-32(%rsi), %r9
	movq	-24(%rsi), %r10
	movq	-16(%rsi), %r11
	movq	-8(%rsi), %rdx
	movq	%r9, -32(%rdi)
	movq	%r10, -24(%rdi)
	movq	%r11, -16(%rdi)
	movq	%rdx, -8(%rdi)
	ret
1808
/* Move the 24 bytes ending at %rsi to the 24 bytes ending at %rdi as
   three quadwords, every load before the first store.  */
	.p2align 4
L(write_24bytes):
	movq	-24(%rsi), %r10
	movq	-16(%rsi), %r11
	movq	-8(%rsi), %rdx
	movq	%r10, -24(%rdi)
	movq	%r11, -16(%rdi)
	movq	%rdx, -8(%rdi)
	ret
1818
/* Move the 16 bytes ending at %rsi to the 16 bytes ending at %rdi as
   two quadwords, both loaded before either is stored.  */
	.p2align 4
L(write_16bytes):
	movq	-16(%rsi), %r11
	movq	-8(%rsi), %rdx
	movq	%r11, -16(%rdi)
	movq	%rdx, -8(%rdi)
	ret
1826
/* Move the quadword ending at %rsi to the quadword ending at %rdi.
   L(write_0bytes) shares the trailing ret and copies nothing.  */
	.p2align 4
L(write_8bytes):
	movq	-8(%rsi), %rdx
	movq	%rdx, -8(%rdi)
L(write_0bytes):
	ret
1833
/* Move the 73 bytes ending at %rsi to the 73 bytes ending at %rdi.
   The wide moves reach offset -2; the 4-byte move at -4 overlaps them
   to supply the last byte.  Every load precedes the first store.  */
	.p2align 4
L(write_73bytes):
	movdqu	-73(%rsi), %xmm0
	movdqu	-57(%rsi), %xmm1
	movq	-41(%rsi), %rcx
	movq	-33(%rsi), %r9
	movq	-25(%rsi), %r10
	movq	-17(%rsi), %r11
	movq	-9(%rsi), %r8
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -73(%rdi)
	movdqu	%xmm1, -57(%rdi)
	movq	%rcx, -41(%rdi)
	movq	%r9, -33(%rdi)
	movq	%r10, -25(%rdi)
	movq	%r11, -17(%rdi)
	movq	%r8, -9(%rdi)
	movl	%edx, -4(%rdi)
	ret
1853
/* Move the 65 bytes ending at %rsi to the 65 bytes ending at %rdi.
   The wide moves reach offset -2; the 4-byte move at -4 overlaps them
   to supply the last byte.  Every load precedes the first store.  */
	.p2align 4
L(write_65bytes):
	movdqu	-65(%rsi), %xmm0
	movdqu	-49(%rsi), %xmm1
	movq	-33(%rsi), %r9
	movq	-25(%rsi), %r10
	movq	-17(%rsi), %r11
	movq	-9(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -65(%rdi)
	movdqu	%xmm1, -49(%rdi)
	movq	%r9, -33(%rdi)
	movq	%r10, -25(%rdi)
	movq	%r11, -17(%rdi)
	movq	%rcx, -9(%rdi)
	movl	%edx, -4(%rdi)
	ret
1871
/* Move the 57 bytes ending at %rsi to the 57 bytes ending at %rdi.
   The wide moves reach offset -2; the 4-byte move at -4 overlaps them
   to supply the last byte.  Every load precedes the first store.  */
	.p2align 4
L(write_57bytes):
	movdqu	-57(%rsi), %xmm0
	movq	-41(%rsi), %r8
	movq	-33(%rsi), %r9
	movq	-25(%rsi), %r10
	movq	-17(%rsi), %r11
	movq	-9(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -57(%rdi)
	movq	%r8, -41(%rdi)
	movq	%r9, -33(%rdi)
	movq	%r10, -25(%rdi)
	movq	%r11, -17(%rdi)
	movq	%rcx, -9(%rdi)
	movl	%edx, -4(%rdi)
	ret
1889
/* Move the 49 bytes ending at %rsi to the 49 bytes ending at %rdi.
   The wide moves reach offset -2; the 4-byte move at -4 overlaps them
   to supply the last byte.  Every load precedes the first store.  */
	.p2align 4
L(write_49bytes):
	movdqu	-49(%rsi), %xmm0
	movq	-33(%rsi), %r9
	movq	-25(%rsi), %r10
	movq	-17(%rsi), %r11
	movq	-9(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -49(%rdi)
	movq	%r9, -33(%rdi)
	movq	%r10, -25(%rdi)
	movq	%r11, -17(%rdi)
	movq	%rcx, -9(%rdi)
	movl	%edx, -4(%rdi)
	ret
1905
/* Move the 41 bytes ending at %rsi to the 41 bytes ending at %rdi:
   five quadwords followed by a single byte, every load before the
   first store.  */
	.p2align 4
L(write_41bytes):
	movq	-41(%rsi), %r8
	movq	-33(%rsi), %r9
	movq	-25(%rsi), %r10
	movq	-17(%rsi), %r11
	movq	-9(%rsi), %rcx
	movb	-1(%rsi), %dl
	movq	%r8, -41(%rdi)
	movq	%r9, -33(%rdi)
	movq	%r10, -25(%rdi)
	movq	%r11, -17(%rdi)
	movq	%rcx, -9(%rdi)
	movb	%dl, -1(%rdi)
	ret
1921
/* Move the 33 bytes ending at %rsi to the 33 bytes ending at %rdi:
   four quadwords followed by a single byte, every load before the
   first store.  */
	.p2align 4
L(write_33bytes):
	movq	-33(%rsi), %r9
	movq	-25(%rsi), %r10
	movq	-17(%rsi), %r11
	movq	-9(%rsi), %rcx
	movb	-1(%rsi), %dl
	movq	%r9, -33(%rdi)
	movq	%r10, -25(%rdi)
	movq	%r11, -17(%rdi)
	movq	%rcx, -9(%rdi)
	movb	%dl, -1(%rdi)
	ret
1935
/* Move the 25 bytes ending at %rsi to the 25 bytes ending at %rdi:
   three quadwords followed by a single byte, every load before the
   first store.  */
	.p2align 4
L(write_25bytes):
	movq	-25(%rsi), %r10
	movq	-17(%rsi), %r11
	movq	-9(%rsi), %rcx
	movb	-1(%rsi), %dl
	movq	%r10, -25(%rdi)
	movq	%r11, -17(%rdi)
	movq	%rcx, -9(%rdi)
	movb	%dl, -1(%rdi)
	ret
1947
/* Move the 17 bytes ending at %rsi to the 17 bytes ending at %rdi.
   Two quadwords reach offset -2; the 4-byte move at -4 overlaps them
   to supply the last byte.  Every load precedes the first store.  */
	.p2align 4
L(write_17bytes):
	movq	-17(%rsi), %r11
	movq	-9(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r11, -17(%rdi)
	movq	%rcx, -9(%rdi)
	movl	%edx, -4(%rdi)
	ret
1957
/* Move the 9 bytes ending at %rsi to the 9 bytes ending at %rdi: a
   quadword at -9 plus an overlapping 4-byte move at -4, both loaded
   before either is stored.  */
	.p2align 4
L(write_9bytes):
	movq	-9(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%rcx, -9(%rdi)
	movl	%edx, -4(%rdi)
	ret
1965
/* Move the single byte just below %rsi to the byte just below %rdi.  */
	.p2align 4
L(write_1bytes):
	movb	-1(%rsi), %dl
	movb	%dl, -1(%rdi)
	ret
1971
/* Move the 74 bytes ending at %rsi to the 74 bytes ending at %rdi.
   The wide moves reach offset -3; the 4-byte move at -4 overlaps them
   to finish the range.  Every load precedes the first store.  */
	.p2align 4
L(write_74bytes):
	movdqu	-74(%rsi), %xmm0
	movdqu	-58(%rsi), %xmm1
	movq	-42(%rsi), %r8
	movq	-34(%rsi), %r9
	movq	-26(%rsi), %r10
	movq	-18(%rsi), %r11
	movq	-10(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -74(%rdi)
	movdqu	%xmm1, -58(%rdi)
	movq	%r8, -42(%rdi)
	movq	%r9, -34(%rdi)
	movq	%r10, -26(%rdi)
	movq	%r11, -18(%rdi)
	movq	%rcx, -10(%rdi)
	movl	%edx, -4(%rdi)
	ret
1991
/* Move the 66 bytes ending at %rsi to the 66 bytes ending at %rdi.
   Coverage: -66..-51 and -50..-35 through %xmm0/%xmm1, -34..-3 through
   four quadwords, and -4..-1 through an overlapping 4-byte move.  No
   separate quadword at -42 is needed because %xmm1 already spans it.
   Every load precedes the first store.  */
	.p2align 4
L(write_66bytes):
	movdqu	-66(%rsi), %xmm0
	movdqu	-50(%rsi), %xmm1
	mov	-34(%rsi), %r9
	mov	-26(%rsi), %r10
	mov	-18(%rsi), %r11
	mov	-10(%rsi), %rcx
	mov	-4(%rsi), %edx
	movdqu	%xmm0, -66(%rdi)
	movdqu	%xmm1, -50(%rdi)
	mov	%r9, -34(%rdi)
	mov	%r10, -26(%rdi)
	mov	%r11, -18(%rdi)
	mov	%rcx, -10(%rdi)
	mov	%edx, -4(%rdi)
	ret
2011
/* Move the 58 bytes ending at %rsi to the 58 bytes ending at %rdi.
   The wide moves reach offset -3; the 4-byte move at -4 overlaps them
   to finish the range.  Every load precedes the first store.  */
	.p2align 4
L(write_58bytes):
	movdqu	-58(%rsi), %xmm1
	movq	-42(%rsi), %r8
	movq	-34(%rsi), %r9
	movq	-26(%rsi), %r10
	movq	-18(%rsi), %r11
	movq	-10(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm1, -58(%rdi)
	movq	%r8, -42(%rdi)
	movq	%r9, -34(%rdi)
	movq	%r10, -26(%rdi)
	movq	%r11, -18(%rdi)
	movq	%rcx, -10(%rdi)
	movl	%edx, -4(%rdi)
	ret
2029
/* Move the 50 bytes ending at %rsi to the 50 bytes ending at %rdi.
   The wide moves reach offset -3; the 4-byte move at -4 overlaps them
   to finish the range.  Every load precedes the first store.  */
	.p2align 4
L(write_50bytes):
	movdqu	-50(%rsi), %xmm0
	movq	-34(%rsi), %r9
	movq	-26(%rsi), %r10
	movq	-18(%rsi), %r11
	movq	-10(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -50(%rdi)
	movq	%r9, -34(%rdi)
	movq	%r10, -26(%rdi)
	movq	%r11, -18(%rdi)
	movq	%rcx, -10(%rdi)
	movl	%edx, -4(%rdi)
	ret
2045
/* Move the 42 bytes ending at %rsi to the 42 bytes ending at %rdi:
   five quadwords reaching -3 plus an overlapping 4-byte move at -4.
   Every load precedes the first store.  */
	.p2align 4
L(write_42bytes):
	movq	-42(%rsi), %r8
	movq	-34(%rsi), %r9
	movq	-26(%rsi), %r10
	movq	-18(%rsi), %r11
	movq	-10(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r8, -42(%rdi)
	movq	%r9, -34(%rdi)
	movq	%r10, -26(%rdi)
	movq	%r11, -18(%rdi)
	movq	%rcx, -10(%rdi)
	movl	%edx, -4(%rdi)
	ret
2061
/* Move the 34 bytes ending at %rsi to the 34 bytes ending at %rdi:
   four quadwords reaching -3 plus an overlapping 4-byte move at -4.
   Every load precedes the first store.  */
	.p2align 4
L(write_34bytes):
	movq	-34(%rsi), %r9
	movq	-26(%rsi), %r10
	movq	-18(%rsi), %r11
	movq	-10(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r9, -34(%rdi)
	movq	%r10, -26(%rdi)
	movq	%r11, -18(%rdi)
	movq	%rcx, -10(%rdi)
	movl	%edx, -4(%rdi)
	ret
2075
/* Move the 26 bytes ending at %rsi to the 26 bytes ending at %rdi:
   three quadwords reaching -3 plus an overlapping 4-byte move at -4.
   Every load precedes the first store.  */
	.p2align 4
L(write_26bytes):
	movq	-26(%rsi), %r10
	movq	-18(%rsi), %r11
	movq	-10(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r10, -26(%rdi)
	movq	%r11, -18(%rdi)
	movq	%rcx, -10(%rdi)
	movl	%edx, -4(%rdi)
	ret
2087
/* Move the 18 bytes ending at %rsi to the 18 bytes ending at %rdi:
   two quadwords reaching -3 plus an overlapping 4-byte move at -4.
   Every load precedes the first store.  */
	.p2align 4
L(write_18bytes):
	movq	-18(%rsi), %r11
	movq	-10(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r11, -18(%rdi)
	movq	%rcx, -10(%rdi)
	movl	%edx, -4(%rdi)
	ret
2097
/* Move the 10 bytes ending at %rsi to the 10 bytes ending at %rdi: a
   quadword at -10 plus an overlapping 4-byte move at -4, both loaded
   before either is stored.  */
	.p2align 4
L(write_10bytes):
	movq	-10(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%rcx, -10(%rdi)
	movl	%edx, -4(%rdi)
	ret
2105
/* Move the 2 bytes ending at %rsi to the 2 bytes ending at %rdi with
   one 16-bit move.  */
	.p2align 4
L(write_2bytes):
	movw	-2(%rsi), %dx
	movw	%dx, -2(%rdi)
	ret
2111
/* Move the 75 bytes ending at %rsi to the 75 bytes ending at %rdi.
   The wide moves reach offset -4; the 4-byte move at -4 overlaps one
   byte of them to finish the range.  Every load precedes the first
   store.  */
	.p2align 4
L(write_75bytes):
	movdqu	-75(%rsi), %xmm0
	movdqu	-59(%rsi), %xmm1
	movq	-43(%rsi), %r8
	movq	-35(%rsi), %r9
	movq	-27(%rsi), %r10
	movq	-19(%rsi), %r11
	movq	-11(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -75(%rdi)
	movdqu	%xmm1, -59(%rdi)
	movq	%r8, -43(%rdi)
	movq	%r9, -35(%rdi)
	movq	%r10, -27(%rdi)
	movq	%r11, -19(%rdi)
	movq	%rcx, -11(%rdi)
	movl	%edx, -4(%rdi)
	ret
2131
/* Move the 67 bytes ending at %rsi to the 67 bytes ending at %rdi.
   Coverage: -67..-52 and -51..-36 through %xmm0/%xmm1 (back to back,
   no overlap), -35..-4 through four quadwords, and -4..-1 through a
   4-byte move that overlaps one byte.  Every load precedes the first
   store.  */
	.p2align 4
L(write_67bytes):
	movdqu	-67(%rsi), %xmm0
	movdqu	-51(%rsi), %xmm1
	mov	-35(%rsi), %r9
	mov	-27(%rsi), %r10
	mov	-19(%rsi), %r11
	mov	-11(%rsi), %rcx
	mov	-4(%rsi), %edx
	movdqu	%xmm0, -67(%rdi)
	movdqu	%xmm1, -51(%rdi)
	mov	%r9, -35(%rdi)
	mov	%r10, -27(%rdi)
	mov	%r11, -19(%rdi)
	mov	%rcx, -11(%rdi)
	mov	%edx, -4(%rdi)
	ret
2151
/* Move the 59 bytes ending at %rsi to the 59 bytes ending at %rdi.
   The wide moves reach offset -4; the 4-byte move at -4 overlaps one
   byte of them to finish the range.  Every load precedes the first
   store.  */
	.p2align 4
L(write_59bytes):
	movdqu	-59(%rsi), %xmm0
	movq	-43(%rsi), %r8
	movq	-35(%rsi), %r9
	movq	-27(%rsi), %r10
	movq	-19(%rsi), %r11
	movq	-11(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -59(%rdi)
	movq	%r8, -43(%rdi)
	movq	%r9, -35(%rdi)
	movq	%r10, -27(%rdi)
	movq	%r11, -19(%rdi)
	movq	%rcx, -11(%rdi)
	movl	%edx, -4(%rdi)
	ret
2169
/* Move the 51 bytes ending at %rsi to the 51 bytes ending at %rdi.
   The wide moves reach offset -4; the 4-byte move at -4 overlaps one
   byte of them to finish the range.  Every load precedes the first
   store.  */
	.p2align 4
L(write_51bytes):
	movdqu	-51(%rsi), %xmm0
	movq	-35(%rsi), %r9
	movq	-27(%rsi), %r10
	movq	-19(%rsi), %r11
	movq	-11(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -51(%rdi)
	movq	%r9, -35(%rdi)
	movq	%r10, -27(%rdi)
	movq	%r11, -19(%rdi)
	movq	%rcx, -11(%rdi)
	movl	%edx, -4(%rdi)
	ret
2185
/* Move the 43 bytes ending at %rsi to the 43 bytes ending at %rdi:
   five quadwords reaching -4 plus a 4-byte move at -4 that overlaps
   one byte.  Every load precedes the first store.  */
	.p2align 4
L(write_43bytes):
	movq	-43(%rsi), %r8
	movq	-35(%rsi), %r9
	movq	-27(%rsi), %r10
	movq	-19(%rsi), %r11
	movq	-11(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r8, -43(%rdi)
	movq	%r9, -35(%rdi)
	movq	%r10, -27(%rdi)
	movq	%r11, -19(%rdi)
	movq	%rcx, -11(%rdi)
	movl	%edx, -4(%rdi)
	ret
2201
/* Move the 35 bytes ending at %rsi to the 35 bytes ending at %rdi:
   four quadwords reaching -4 plus a 4-byte move at -4 that overlaps
   one byte.  Every load precedes the first store.  */
	.p2align 4
L(write_35bytes):
	movq	-35(%rsi), %r9
	movq	-27(%rsi), %r10
	movq	-19(%rsi), %r11
	movq	-11(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r9, -35(%rdi)
	movq	%r10, -27(%rdi)
	movq	%r11, -19(%rdi)
	movq	%rcx, -11(%rdi)
	movl	%edx, -4(%rdi)
	ret
2215
/* Move the 27 bytes ending at %rsi to the 27 bytes ending at %rdi:
   three quadwords reaching -4 plus a 4-byte move at -4 that overlaps
   one byte.  Every load precedes the first store.  */
	.p2align 4
L(write_27bytes):
	movq	-27(%rsi), %r10
	movq	-19(%rsi), %r11
	movq	-11(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r10, -27(%rdi)
	movq	%r11, -19(%rdi)
	movq	%rcx, -11(%rdi)
	movl	%edx, -4(%rdi)
	ret
2227
/* Move the 19 bytes ending at %rsi to the 19 bytes ending at %rdi:
   two quadwords reaching -4 plus a 4-byte move at -4 that overlaps
   one byte.  Every load precedes the first store.  */
	.p2align 4
L(write_19bytes):
	movq	-19(%rsi), %r11
	movq	-11(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r11, -19(%rdi)
	movq	%rcx, -11(%rdi)
	movl	%edx, -4(%rdi)
	ret
2237
/* Move the 11 bytes ending at %rsi to the 11 bytes ending at %rdi: a
   quadword at -11 plus a 4-byte move at -4 that overlaps one byte,
   both loaded before either is stored.  */
	.p2align 4
L(write_11bytes):
	movq	-11(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%rcx, -11(%rdi)
	movl	%edx, -4(%rdi)
	ret
2245
/* Move the 3 bytes ending at %rsi to the 3 bytes ending at %rdi using
   two 16-bit moves that share the middle byte; both are loaded before
   either is stored.  */
	.p2align 4
L(write_3bytes):
	movw	-3(%rsi), %dx
	movw	-2(%rsi), %cx
	movw	%dx, -3(%rdi)
	movw	%cx, -2(%rdi)
	ret
2253
/* Move the 76 bytes ending at %rsi to the 76 bytes ending at %rdi:
   two 16-byte, five 8-byte and one 4-byte move laid end to end.
   Every load precedes the first store.  */
	.p2align 4
L(write_76bytes):
	movdqu	-76(%rsi), %xmm0
	movdqu	-60(%rsi), %xmm1
	movq	-44(%rsi), %r8
	movq	-36(%rsi), %r9
	movq	-28(%rsi), %r10
	movq	-20(%rsi), %r11
	movq	-12(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -76(%rdi)
	movdqu	%xmm1, -60(%rdi)
	movq	%r8, -44(%rdi)
	movq	%r9, -36(%rdi)
	movq	%r10, -28(%rdi)
	movq	%r11, -20(%rdi)
	movq	%rcx, -12(%rdi)
	movl	%edx, -4(%rdi)
	ret
2273
/* Move the 68 bytes ending at %rsi to the 68 bytes ending at %rdi:
   two 16-byte, four 8-byte and one 4-byte move laid end to end.
   Every load precedes the first store.  */
	.p2align 4
L(write_68bytes):
	movdqu	-68(%rsi), %xmm0
	movdqu	-52(%rsi), %xmm1
	movq	-36(%rsi), %r9
	movq	-28(%rsi), %r10
	movq	-20(%rsi), %r11
	movq	-12(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -68(%rdi)
	movdqu	%xmm1, -52(%rdi)
	movq	%r9, -36(%rdi)
	movq	%r10, -28(%rdi)
	movq	%r11, -20(%rdi)
	movq	%rcx, -12(%rdi)
	movl	%edx, -4(%rdi)
	ret
2291
/* Move the 60 bytes ending at %rsi to the 60 bytes ending at %rdi:
   one 16-byte, five 8-byte and one 4-byte move laid end to end.
   Every load precedes the first store.  */
	.p2align 4
L(write_60bytes):
	movdqu	-60(%rsi), %xmm0
	movq	-44(%rsi), %r8
	movq	-36(%rsi), %r9
	movq	-28(%rsi), %r10
	movq	-20(%rsi), %r11
	movq	-12(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -60(%rdi)
	movq	%r8, -44(%rdi)
	movq	%r9, -36(%rdi)
	movq	%r10, -28(%rdi)
	movq	%r11, -20(%rdi)
	movq	%rcx, -12(%rdi)
	movl	%edx, -4(%rdi)
	ret
2309
/* Move the 52 bytes ending at %rsi to the 52 bytes ending at %rdi:
   one 16-byte, four 8-byte and one 4-byte move laid end to end.
   Every load precedes the first store.  */
	.p2align 4
L(write_52bytes):
	movdqu	-52(%rsi), %xmm0
	movq	-36(%rsi), %r9
	movq	-28(%rsi), %r10
	movq	-20(%rsi), %r11
	movq	-12(%rsi), %rcx
	movl	-4(%rsi), %edx
	movdqu	%xmm0, -52(%rdi)
	movq	%r9, -36(%rdi)
	movq	%r10, -28(%rdi)
	movq	%r11, -20(%rdi)
	movq	%rcx, -12(%rdi)
	movl	%edx, -4(%rdi)
	ret
2325
/* Move the 44 bytes ending at %rsi to the 44 bytes ending at %rdi:
   five quadwords followed by one 4-byte move, every load before the
   first store.  */
	.p2align 4
L(write_44bytes):
	movq	-44(%rsi), %r8
	movq	-36(%rsi), %r9
	movq	-28(%rsi), %r10
	movq	-20(%rsi), %r11
	movq	-12(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r8, -44(%rdi)
	movq	%r9, -36(%rdi)
	movq	%r10, -28(%rdi)
	movq	%r11, -20(%rdi)
	movq	%rcx, -12(%rdi)
	movl	%edx, -4(%rdi)
	ret
2341
/* Move the 36 bytes ending at %rsi to the 36 bytes ending at %rdi:
   four quadwords followed by one 4-byte move, every load before the
   first store.  */
	.p2align 4
L(write_36bytes):
	movq	-36(%rsi), %r9
	movq	-28(%rsi), %r10
	movq	-20(%rsi), %r11
	movq	-12(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r9, -36(%rdi)
	movq	%r10, -28(%rdi)
	movq	%r11, -20(%rdi)
	movq	%rcx, -12(%rdi)
	movl	%edx, -4(%rdi)
	ret
2355
/* Move the 28 bytes ending at %rsi to the 28 bytes ending at %rdi:
   three quadwords followed by one 4-byte move, every load before the
   first store.  */
	.p2align 4
L(write_28bytes):
	movq	-28(%rsi), %r10
	movq	-20(%rsi), %r11
	movq	-12(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r10, -28(%rdi)
	movq	%r11, -20(%rdi)
	movq	%rcx, -12(%rdi)
	movl	%edx, -4(%rdi)
	ret
2367
/* Move the 20 bytes ending at %rsi to the 20 bytes ending at %rdi:
   two quadwords followed by one 4-byte move, every load before the
   first store.  */
	.p2align 4
L(write_20bytes):
	movq	-20(%rsi), %r11
	movq	-12(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%r11, -20(%rdi)
	movq	%rcx, -12(%rdi)
	movl	%edx, -4(%rdi)
	ret
2377
/* Move the 12 bytes ending at %rsi to the 12 bytes ending at %rdi: one
   quadword followed by one 4-byte move, both loaded before either is
   stored.  */
	.p2align 4
L(write_12bytes):
	movq	-12(%rsi), %rcx
	movl	-4(%rsi), %edx
	movq	%rcx, -12(%rdi)
	movl	%edx, -4(%rdi)
	ret
2385
/* Move the 4 bytes ending at %rsi to the 4 bytes ending at %rdi with
   one 32-bit move.  */
	.p2align 4
L(write_4bytes):
	movl	-4(%rsi), %edx
	movl	%edx, -4(%rdi)
	ret
2391
/* Move the 77 bytes ending at %rsi to the 77 bytes ending at %rdi.
   The moves up to -13 reach offset -6; the final quadword at -8
   overlaps them to finish the range.  Every load precedes the first
   store.  */
	.p2align 4
L(write_77bytes):
	movdqu	-77(%rsi), %xmm0
	movdqu	-61(%rsi), %xmm1
	movq	-45(%rsi), %r8
	movq	-37(%rsi), %r9
	movq	-29(%rsi), %r10
	movq	-21(%rsi), %r11
	movq	-13(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movdqu	%xmm0, -77(%rdi)
	movdqu	%xmm1, -61(%rdi)
	movq	%r8, -45(%rdi)
	movq	%r9, -37(%rdi)
	movq	%r10, -29(%rdi)
	movq	%r11, -21(%rdi)
	movq	%rcx, -13(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2411
/* Move the 69 bytes ending at %rsi to the 69 bytes ending at %rdi.
   The moves up to -13 reach offset -6; the final quadword at -8
   overlaps them to finish the range.  Every load precedes the first
   store.  */
	.p2align 4
L(write_69bytes):
	movdqu	-69(%rsi), %xmm0
	movdqu	-53(%rsi), %xmm1
	movq	-37(%rsi), %r9
	movq	-29(%rsi), %r10
	movq	-21(%rsi), %r11
	movq	-13(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movdqu	%xmm0, -69(%rdi)
	movdqu	%xmm1, -53(%rdi)
	movq	%r9, -37(%rdi)
	movq	%r10, -29(%rdi)
	movq	%r11, -21(%rdi)
	movq	%rcx, -13(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2429
/* Move the 61 bytes ending at %rsi to the 61 bytes ending at %rdi.
   The moves up to -13 reach offset -6; the final quadword at -8
   overlaps them to finish the range.  Every load precedes the first
   store.  */
	.p2align 4
L(write_61bytes):
	movdqu	-61(%rsi), %xmm0
	movq	-45(%rsi), %r8
	movq	-37(%rsi), %r9
	movq	-29(%rsi), %r10
	movq	-21(%rsi), %r11
	movq	-13(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movdqu	%xmm0, -61(%rdi)
	movq	%r8, -45(%rdi)
	movq	%r9, -37(%rdi)
	movq	%r10, -29(%rdi)
	movq	%r11, -21(%rdi)
	movq	%rcx, -13(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2447
/* Move the 53 bytes ending at %rsi to the 53 bytes ending at %rdi.
   Coverage: -53..-38 through %xmm0, -37..-6 through four quadwords,
   and -8..-1 through a final quadword that overlaps the previous one.
   Only values that are subsequently stored are loaded.  Every load
   precedes the first store.  */
	.p2align 4
L(write_53bytes):
	movdqu	-53(%rsi), %xmm0
	mov	-37(%rsi), %r9
	mov	-29(%rsi), %r10
	mov	-21(%rsi), %r11
	mov	-13(%rsi), %rcx
	mov	-8(%rsi), %rdx
	movdqu	%xmm0, -53(%rdi)
	mov	%r9, -37(%rdi)
	mov	%r10, -29(%rdi)
	mov	%r11, -21(%rdi)
	mov	%rcx, -13(%rdi)
	mov	%rdx, -8(%rdi)
	ret
2464
/* Move the 45 bytes ending at %rsi to the 45 bytes ending at %rdi:
   five quadwords reaching -6 plus an overlapping quadword at -8.
   Every load precedes the first store.  */
	.p2align 4
L(write_45bytes):
	movq	-45(%rsi), %r8
	movq	-37(%rsi), %r9
	movq	-29(%rsi), %r10
	movq	-21(%rsi), %r11
	movq	-13(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movq	%r8, -45(%rdi)
	movq	%r9, -37(%rdi)
	movq	%r10, -29(%rdi)
	movq	%r11, -21(%rdi)
	movq	%rcx, -13(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2480
/* Move the 37 bytes ending at %rsi to the 37 bytes ending at %rdi:
   four quadwords reaching -6 plus an overlapping quadword at -8.
   Every load precedes the first store.  */
	.p2align 4
L(write_37bytes):
	movq	-37(%rsi), %r9
	movq	-29(%rsi), %r10
	movq	-21(%rsi), %r11
	movq	-13(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movq	%r9, -37(%rdi)
	movq	%r10, -29(%rdi)
	movq	%r11, -21(%rdi)
	movq	%rcx, -13(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2494
/* Move the 29 bytes ending at %rsi to the 29 bytes ending at %rdi:
   three quadwords reaching -6 plus an overlapping quadword at -8.
   Every load precedes the first store.  */
	.p2align 4
L(write_29bytes):
	movq	-29(%rsi), %r10
	movq	-21(%rsi), %r11
	movq	-13(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movq	%r10, -29(%rdi)
	movq	%r11, -21(%rdi)
	movq	%rcx, -13(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2506
/* Move the 21 bytes ending at %rsi to the 21 bytes ending at %rdi:
   two quadwords reaching -6 plus an overlapping quadword at -8.
   Every load precedes the first store.  */
	.p2align 4
L(write_21bytes):
	movq	-21(%rsi), %r11
	movq	-13(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movq	%r11, -21(%rdi)
	movq	%rcx, -13(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2516
/* Move the 13 bytes ending at %rsi to the 13 bytes ending at %rdi with
   two overlapping quadwords (at -13 and -8), both loaded before either
   is stored.  */
	.p2align 4
L(write_13bytes):
	movq	-13(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movq	%rcx, -13(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2524
/* Move the 5 bytes ending at %rsi to the 5 bytes ending at %rdi with
   two overlapping 32-bit moves (at -5 and -4), both loaded before
   either is stored.  */
	.p2align 4
L(write_5bytes):
	movl	-5(%rsi), %edx
	movl	-4(%rsi), %ecx
	movl	%edx, -5(%rdi)
	movl	%ecx, -4(%rdi)
	ret
2532
/* Move the 78 bytes ending at %rsi to the 78 bytes ending at %rdi.
   The moves up to -14 reach offset -7; the final quadword at -8
   overlaps them to finish the range.  Every load precedes the first
   store.  */
	.p2align 4
L(write_78bytes):
	movdqu	-78(%rsi), %xmm0
	movdqu	-62(%rsi), %xmm1
	movq	-46(%rsi), %r8
	movq	-38(%rsi), %r9
	movq	-30(%rsi), %r10
	movq	-22(%rsi), %r11
	movq	-14(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movdqu	%xmm0, -78(%rdi)
	movdqu	%xmm1, -62(%rdi)
	movq	%r8, -46(%rdi)
	movq	%r9, -38(%rdi)
	movq	%r10, -30(%rdi)
	movq	%r11, -22(%rdi)
	movq	%rcx, -14(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2552
/* Move the 70 bytes ending at %rsi to the 70 bytes ending at %rdi.
   The moves up to -14 reach offset -7; the final quadword at -8
   overlaps them to finish the range.  Every load precedes the first
   store.  */
	.p2align 4
L(write_70bytes):
	movdqu	-70(%rsi), %xmm0
	movdqu	-54(%rsi), %xmm1
	movq	-38(%rsi), %r9
	movq	-30(%rsi), %r10
	movq	-22(%rsi), %r11
	movq	-14(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movdqu	%xmm0, -70(%rdi)
	movdqu	%xmm1, -54(%rdi)
	movq	%r9, -38(%rdi)
	movq	%r10, -30(%rdi)
	movq	%r11, -22(%rdi)
	movq	%rcx, -14(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2570
/* Move the 62 bytes ending at %rsi to the 62 bytes ending at %rdi.
   The moves up to -14 reach offset -7; the final quadword at -8
   overlaps them to finish the range.  Every load precedes the first
   store.  */
	.p2align 4
L(write_62bytes):
	movdqu	-62(%rsi), %xmm0
	movq	-46(%rsi), %r8
	movq	-38(%rsi), %r9
	movq	-30(%rsi), %r10
	movq	-22(%rsi), %r11
	movq	-14(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movdqu	%xmm0, -62(%rdi)
	movq	%r8, -46(%rdi)
	movq	%r9, -38(%rdi)
	movq	%r10, -30(%rdi)
	movq	%r11, -22(%rdi)
	movq	%rcx, -14(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2588
/* Move the 54 bytes ending at %rsi to the 54 bytes ending at %rdi.
   The moves up to -14 reach offset -7; the final quadword at -8
   overlaps them to finish the range.  Every load precedes the first
   store.  */
	.p2align 4
L(write_54bytes):
	movdqu	-54(%rsi), %xmm0
	movq	-38(%rsi), %r9
	movq	-30(%rsi), %r10
	movq	-22(%rsi), %r11
	movq	-14(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movdqu	%xmm0, -54(%rdi)
	movq	%r9, -38(%rdi)
	movq	%r10, -30(%rdi)
	movq	%r11, -22(%rdi)
	movq	%rcx, -14(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2604
/* Move the 46 bytes ending at %rsi to the 46 bytes ending at %rdi:
   five quadwords reaching -7 plus an overlapping quadword at -8.
   Every load precedes the first store.  */
	.p2align 4
L(write_46bytes):
	movq	-46(%rsi), %r8
	movq	-38(%rsi), %r9
	movq	-30(%rsi), %r10
	movq	-22(%rsi), %r11
	movq	-14(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movq	%r8, -46(%rdi)
	movq	%r9, -38(%rdi)
	movq	%r10, -30(%rdi)
	movq	%r11, -22(%rdi)
	movq	%rcx, -14(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2620
/* Move the 38 bytes ending at %rsi to the 38 bytes ending at %rdi:
   four quadwords reaching -7 plus an overlapping quadword at -8.
   Every load precedes the first store.  */
	.p2align 4
L(write_38bytes):
	movq	-38(%rsi), %r9
	movq	-30(%rsi), %r10
	movq	-22(%rsi), %r11
	movq	-14(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movq	%r9, -38(%rdi)
	movq	%r10, -30(%rdi)
	movq	%r11, -22(%rdi)
	movq	%rcx, -14(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2634
/* Move the 30 bytes ending at %rsi to the 30 bytes ending at %rdi:
   three quadwords reaching -7 plus an overlapping quadword at -8.
   Every load precedes the first store.  */
	.p2align 4
L(write_30bytes):
	movq	-30(%rsi), %r10
	movq	-22(%rsi), %r11
	movq	-14(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movq	%r10, -30(%rdi)
	movq	%r11, -22(%rdi)
	movq	%rcx, -14(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2646
/* Move the 22 bytes ending at %rsi to the 22 bytes ending at %rdi:
   two quadwords reaching -7 plus an overlapping quadword at -8.
   Every load precedes the first store.  */
	.p2align 4
L(write_22bytes):
	movq	-22(%rsi), %r11
	movq	-14(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movq	%r11, -22(%rdi)
	movq	%rcx, -14(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2656
/* Move the 14 bytes ending at %rsi to the 14 bytes ending at %rdi with
   two overlapping quadwords (at -14 and -8), both loaded before either
   is stored.  */
	.p2align 4
L(write_14bytes):
	movq	-14(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movq	%rcx, -14(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2664
/* Move the 6 bytes ending at %rsi to the 6 bytes ending at %rdi with
   two overlapping 32-bit moves (at -6 and -4), both loaded before
   either is stored.  */
	.p2align 4
L(write_6bytes):
	movl	-6(%rsi), %edx
	movl	-4(%rsi), %ecx
	movl	%edx, -6(%rdi)
	movl	%ecx, -4(%rdi)
	ret
2672
2673 .p2align 4
L(write_79bytes):
	/* Copy the 79 bytes that end at %rsi to the 79 bytes that end at
	   %rdi.  Pieces, tail first: 8 @-8, 8 @-15, 8 @-23, 8 @-31, 8 @-39,
	   8 @-47, 16 @-63, 16 @-79 (the two tail quadwords overlap by one
	   byte).  Every load is issued before the first store, so the
	   outcome does not depend on whether the two ranges overlap.  */
	mov	-8(%rsi), %rcx
	mov	-15(%rsi), %rdx
	mov	-23(%rsi), %r8
	mov	-31(%rsi), %r9
	mov	-39(%rsi), %r10
	mov	-47(%rsi), %r11
	movdqu	-63(%rsi), %xmm0
	movdqu	-79(%rsi), %xmm1
	mov	%rcx, -8(%rdi)
	mov	%rdx, -15(%rdi)
	mov	%r8, -23(%rdi)
	mov	%r9, -31(%rdi)
	mov	%r10, -39(%rdi)
	mov	%r11, -47(%rdi)
	movdqu	%xmm0, -63(%rdi)
	movdqu	%xmm1, -79(%rdi)
	ret
2692
2693 .p2align 4
L(write_71bytes):
	/* Copy the 71 bytes that end at %rsi to the 71 bytes that end at
	   %rdi.  Pieces, tail first: 8 @-8, 8 @-15, 8 @-23, 8 @-31, 8 @-39,
	   16 @-55, 16 @-71 (the two tail quadwords overlap by one byte).
	   All loads precede all stores.  */
	mov	-8(%rsi), %rcx
	mov	-15(%rsi), %rdx
	mov	-23(%rsi), %r9
	mov	-31(%rsi), %r10
	mov	-39(%rsi), %r11
	movdqu	-55(%rsi), %xmm0
	movdqu	-71(%rsi), %xmm1
	mov	%rcx, -8(%rdi)
	mov	%rdx, -15(%rdi)
	mov	%r9, -23(%rdi)
	mov	%r10, -31(%rdi)
	mov	%r11, -39(%rdi)
	movdqu	%xmm0, -55(%rdi)
	movdqu	%xmm1, -71(%rdi)
	ret
2710
2711 .p2align 4
L(write_63bytes):
	/* Copy the 63 bytes that end at %rsi to the 63 bytes that end at
	   %rdi.  Pieces, tail first: 8 @-8, 8 @-15, 8 @-23, 8 @-31, 8 @-39,
	   8 @-47, 16 @-63 (the two tail quadwords overlap by one byte).
	   All loads precede all stores.  */
	mov	-8(%rsi), %rcx
	mov	-15(%rsi), %rdx
	mov	-23(%rsi), %r8
	mov	-31(%rsi), %r9
	mov	-39(%rsi), %r10
	mov	-47(%rsi), %r11
	movdqu	-63(%rsi), %xmm0
	mov	%rcx, -8(%rdi)
	mov	%rdx, -15(%rdi)
	mov	%r8, -23(%rdi)
	mov	%r9, -31(%rdi)
	mov	%r10, -39(%rdi)
	mov	%r11, -47(%rdi)
	movdqu	%xmm0, -63(%rdi)
	ret
2728
2729 .p2align 4
L(write_55bytes):
	/* Copy the 55 bytes that end at %rsi to the 55 bytes that end at
	   %rdi.  Pieces, tail first: 8 @-8, 8 @-15, 8 @-23, 8 @-31, 8 @-39,
	   16 @-55 (the two tail quadwords overlap by one byte).  All loads
	   precede all stores.  */
	mov	-8(%rsi), %rcx
	mov	-15(%rsi), %rdx
	mov	-23(%rsi), %r9
	mov	-31(%rsi), %r10
	mov	-39(%rsi), %r11
	movdqu	-55(%rsi), %xmm0
	mov	%rcx, -8(%rdi)
	mov	%rdx, -15(%rdi)
	mov	%r9, -23(%rdi)
	mov	%r10, -31(%rdi)
	mov	%r11, -39(%rdi)
	movdqu	%xmm0, -55(%rdi)
	ret
2744
2745 .p2align 4
L(write_47bytes):
	/* Copy the 47 bytes that end at %rsi to the 47 bytes that end at
	   %rdi using six quadword moves: @-8, @-15, @-23, @-31, @-39, @-47
	   (the two tail quadwords overlap by one byte).  All loads precede
	   all stores.  */
	mov	-8(%rsi), %rcx
	mov	-15(%rsi), %rdx
	mov	-23(%rsi), %r8
	mov	-31(%rsi), %r9
	mov	-39(%rsi), %r10
	mov	-47(%rsi), %r11
	mov	%rcx, -8(%rdi)
	mov	%rdx, -15(%rdi)
	mov	%r8, -23(%rdi)
	mov	%r9, -31(%rdi)
	mov	%r10, -39(%rdi)
	mov	%r11, -47(%rdi)
	ret
2760
2761 .p2align 4
L(write_39bytes):
	/* Copy the 39 bytes that end at %rsi to the 39 bytes that end at
	   %rdi using five quadword moves: @-8, @-15, @-23, @-31, @-39
	   (the two tail quadwords overlap by one byte).  All loads precede
	   all stores.  */
	mov	-8(%rsi), %rcx
	mov	-15(%rsi), %rdx
	mov	-23(%rsi), %r9
	mov	-31(%rsi), %r10
	mov	-39(%rsi), %r11
	mov	%rcx, -8(%rdi)
	mov	%rdx, -15(%rdi)
	mov	%r9, -23(%rdi)
	mov	%r10, -31(%rdi)
	mov	%r11, -39(%rdi)
	ret
2774
2775 .p2align 4
L(write_31bytes):
	/* Copy the 31 bytes that end at %rsi to the 31 bytes that end at
	   %rdi using four quadword moves: @-8, @-15, @-23, @-31 (the two
	   tail quadwords overlap by one byte).  All loads precede all
	   stores.  */
	mov	-8(%rsi), %rcx
	mov	-15(%rsi), %rdx
	mov	-23(%rsi), %r10
	mov	-31(%rsi), %r11
	mov	%rcx, -8(%rdi)
	mov	%rdx, -15(%rdi)
	mov	%r10, -23(%rdi)
	mov	%r11, -31(%rdi)
	ret
2786
2787 .p2align 4
L(write_23bytes):
	/* Copy the 23 bytes that end at %rsi to the 23 bytes that end at
	   %rdi using three quadword moves: @-8, @-15, @-23 (the two tail
	   quadwords overlap by one byte).  All loads precede all stores.  */
	mov	-8(%rsi), %rcx
	mov	-15(%rsi), %rdx
	mov	-23(%rsi), %r11
	mov	%rcx, -8(%rdi)
	mov	%rdx, -15(%rdi)
	mov	%r11, -23(%rdi)
	ret
2796
2797 .p2align 4
L(write_15bytes):
	/* Copy the 15 bytes that end at %rsi to the 15 bytes that end at
	   %rdi with two quadword moves (@-8 and @-15) that overlap by one
	   byte.  Both loads precede both stores.  */
	mov	-8(%rsi), %rcx
	mov	-15(%rsi), %rdx
	mov	%rcx, -8(%rdi)
	mov	%rdx, -15(%rdi)
	ret
2804
2805 .p2align 4
L(write_7bytes):
	/* Copy the 7 bytes that end at %rsi to the 7 bytes that end at %rdi
	   with two 32-bit moves (@-4 and @-7) that overlap by one byte.
	   Both loads precede both stores.  */
	mov	-4(%rsi), %edx
	mov	-7(%rsi), %ecx
	mov	%edx, -4(%rdi)
	mov	%ecx, -7(%rdi)
	ret
2812
2813 .p2align 4
/* Forward copy of a large block using non-temporal (movntdq) stores.
   In:  %rsi = source cursor, %rdi = destination cursor, %rdx = bytes
        still to copy.  %xmm0 and %r8 are set by the code that branches
        here (not visible in this section) -- presumably the saved first
        16 source bytes and the original destination; TODO confirm.
   Out: tail-jumps through L(table_less_80bytes) with %rdx = bytes left
        and %rsi/%rdi advanced past them.
   NOTE(review): the loop copies 0x80 bytes before testing the count, so
   this assumes at least 0x90 bytes remain on entry -- verify at the
   branch sites.  */
2814L(large_page_fwd):
2815 movdqu (%rsi), %xmm1
2816 lea 16(%rsi), %rsi
/* Write out the value held in %xmm0 before %xmm0 is reused by the loop.  */
2817 movdqu %xmm0, (%r8)
/* movntdq faults on an unaligned address, so %rdi must be 16-byte
   aligned from here on.  */
2818 movntdq %xmm1, (%rdi)
2819 lea 16(%rdi), %rdi
/* Subtract the 0x10 bytes just copied plus a 0x80 bias, so that the
   loop's `sub $0x80` / `jae` pair continues only while another full
   128-byte block is available after the current one.  */
2820 lea -0x90(%rdx), %rdx
2821#ifdef USE_AS_MEMMOVE
/* %r9 = src - dst.  If that unsigned distance is at least the biased
   count in %rdx, fall into the non-temporal loop.  */
2822 mov %rsi, %r9
2823 sub %rdi, %r9
2824 cmp %rdx, %r9
2825 jae L(memmove_is_memcpy_fwd)
/* Otherwise use the ordinary-store loop when %rdx is below 4 * %rcx.
   %rcx is set before this section -- presumably a cache-size
   threshold; verify against the caller.  */
2826 shl $2, %rcx
2827 cmp %rcx, %rdx
2828 jb L(ll_cache_copy_fwd_start)
2829L(memmove_is_memcpy_fwd):
2830#endif
/* Main loop: load 128 bytes (unaligned loads), then store them with
   eight non-temporal stores.  All loads precede all stores.  */
2831L(large_page_loop):
2832 movdqu (%rsi), %xmm0
2833 movdqu 0x10(%rsi), %xmm1
2834 movdqu 0x20(%rsi), %xmm2
2835 movdqu 0x30(%rsi), %xmm3
2836 movdqu 0x40(%rsi), %xmm4
2837 movdqu 0x50(%rsi), %xmm5
2838 movdqu 0x60(%rsi), %xmm6
2839 movdqu 0x70(%rsi), %xmm7
2840 lea 0x80(%rsi), %rsi
2841
/* The flags set here are consumed by the `jae` below; the movntdq and
   lea instructions in between do not modify flags.  */
2842 sub $0x80, %rdx
2843 movntdq %xmm0, (%rdi)
2844 movntdq %xmm1, 0x10(%rdi)
2845 movntdq %xmm2, 0x20(%rdi)
2846 movntdq %xmm3, 0x30(%rdi)
2847 movntdq %xmm4, 0x40(%rdi)
2848 movntdq %xmm5, 0x50(%rdi)
2849 movntdq %xmm6, 0x60(%rdi)
2850 movntdq %xmm7, 0x70(%rdi)
2851 lea 0x80(%rdi), %rdi
2852 jae L(large_page_loop)
/* Loop exit: %rdx is in [-0x80, -1].  Compare first (signed), then undo
   the bias with lea, which leaves the flags intact for the jl: skip the
   64-byte block when fewer than 0x40 bytes remain.  */
2853 cmp $-0x40, %rdx
2854 lea 0x80(%rdx), %rdx
2855 jl L(large_page_less_64bytes)
2856
/* One more 64-byte block.  */
2857 movdqu (%rsi), %xmm0
2858 movdqu 0x10(%rsi), %xmm1
2859 movdqu 0x20(%rsi), %xmm2
2860 movdqu 0x30(%rsi), %xmm3
2861 lea 0x40(%rsi), %rsi
2862
2863 movntdq %xmm0, (%rdi)
2864 movntdq %xmm1, 0x10(%rdi)
2865 movntdq %xmm2, 0x20(%rdi)
2866 movntdq %xmm3, 0x30(%rdi)
2867 lea 0x40(%rdi), %rdi
2868 sub $0x40, %rdx
2869L(large_page_less_64bytes):
/* Move both cursors past the remaining %rdx bytes; the write_N routines
   address the tail with negative offsets from %rsi/%rdi.  */
2870 add %rdx, %rsi
2871 add %rdx, %rdi
/* Order the weakly-ordered non-temporal stores before later stores.  */
2872 sfence
/* Dispatch on the remaining byte count (table entries are 4 bytes).  */
2873 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
2874
2875#ifdef USE_AS_MEMMOVE
2876 .p2align 4
/* memmove-only forward loop, entered from L(large_page_fwd).  Same
   structure as L(large_page_loop) but with ordinary aligned stores
   (movaps) and software prefetch instead of non-temporal stores.
   %rdx carries the same -0x80 bias as in L(large_page_loop).
   movaps faults on an unaligned address, so %rdi must be 16-byte
   aligned.
   NOTE(review): L(large_page_fwd) issues one movntdq before branching
   here and this path executes no sfence -- confirm that is acceptable.  */
2877L(ll_cache_copy_fwd_start):
/* Prefetch source data 0x1c0 and 0x200 bytes ahead of the cursor.  */
2878 prefetcht0 0x1c0(%rsi)
2879 prefetcht0 0x200(%rsi)
2880 movdqu (%rsi), %xmm0
2881 movdqu 0x10(%rsi), %xmm1
2882 movdqu 0x20(%rsi), %xmm2
2883 movdqu 0x30(%rsi), %xmm3
2884 movdqu 0x40(%rsi), %xmm4
2885 movdqu 0x50(%rsi), %xmm5
2886 movdqu 0x60(%rsi), %xmm6
2887 movdqu 0x70(%rsi), %xmm7
2888 lea 0x80(%rsi), %rsi
2889
/* Flags from this sub feed the `jae` below; movaps and lea do not
   modify flags.  */
2890 sub $0x80, %rdx
2891 movaps %xmm0, (%rdi)
2892 movaps %xmm1, 0x10(%rdi)
2893 movaps %xmm2, 0x20(%rdi)
2894 movaps %xmm3, 0x30(%rdi)
2895 movaps %xmm4, 0x40(%rdi)
2896 movaps %xmm5, 0x50(%rdi)
2897 movaps %xmm6, 0x60(%rdi)
2898 movaps %xmm7, 0x70(%rdi)
2899 lea 0x80(%rdi), %rdi
2900 jae L(ll_cache_copy_fwd_start)
/* Loop exit: compare the biased count (signed), then remove the bias
   with lea so the flags survive for the jl.  Skip the 64-byte block
   when fewer than 0x40 bytes remain.  */
2901 cmp $-0x40, %rdx
2902 lea 0x80(%rdx), %rdx
2903 jl L(large_page_ll_less_fwd_64bytes)
2904
/* One more 64-byte block.  */
2905 movdqu (%rsi), %xmm0
2906 movdqu 0x10(%rsi), %xmm1
2907 movdqu 0x20(%rsi), %xmm2
2908 movdqu 0x30(%rsi), %xmm3
2909 lea 0x40(%rsi), %rsi
2910
2911 movaps %xmm0, (%rdi)
2912 movaps %xmm1, 0x10(%rdi)
2913 movaps %xmm2, 0x20(%rdi)
2914 movaps %xmm3, 0x30(%rdi)
2915 lea 0x40(%rdi), %rdi
2916 sub $0x40, %rdx
2917L(large_page_ll_less_fwd_64bytes):
/* Move both cursors past the remaining %rdx bytes, then dispatch to the
   matching write_N routine, which uses negative offsets.  */
2918 add %rdx, %rsi
2919 add %rdx, %rdi
2920 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
2921
2922#endif
2923 .p2align 4
/* Backward (high-to-low address) copy of a large block using
   non-temporal stores.
   In:  %rsi/%rdi = source/destination cursors; the data copied lies
        immediately below them.  %rdx = bytes still to copy.  %xmm0 and
        %r8 are set by the code that branches here (not visible in this
        section) -- presumably a saved 16-byte edge block and its
        destination address; TODO confirm.
   Out: tail-jumps through L(table_less_80bytes) with %rdx = bytes left.
   NOTE(review): the loop copies 0x80 bytes before testing the count, so
   this assumes at least 0x90 bytes remain on entry -- verify at the
   branch sites.  */
2924L(large_page_bwd):
2925 movdqu -0x10(%rsi), %xmm1
2926 lea -16(%rsi), %rsi
/* Write out the value held in %xmm0 before %xmm0 is reused by the loop.  */
2927 movdqu %xmm0, (%r8)
/* Ordinary aligned store (the forward path uses movntdq at this
   point); movdqa requires %rdi to be 16-byte aligned.  */
2928 movdqa %xmm1, -0x10(%rdi)
2929 lea -16(%rdi), %rdi
/* Subtract the 0x10 bytes just copied plus a 0x80 bias for the loop's
   `sub $0x80` / `jae` test.  */
2930 lea -0x90(%rdx), %rdx
2931#ifdef USE_AS_MEMMOVE
/* %r9 = dst - src.  If that unsigned distance is at least the biased
   count in %rdx, fall into the non-temporal loop.  */
2932 mov %rdi, %r9
2933 sub %rsi, %r9
2934 cmp %rdx, %r9
2935 jae L(memmove_is_memcpy_bwd)
/* Otherwise use the ordinary-store loop when the distance is below
   %rcx (set before this section -- presumably a cache-size threshold).
   NOTE(review): the forward path tests %rdx against 4 * %rcx, whereas
   this path tests the pointer distance against %rcx -- confirm the
   asymmetry is intended.  */
2936 cmp %rcx, %r9
2937 jb L(ll_cache_copy_bwd_start)
2938L(memmove_is_memcpy_bwd):
2939#endif
/* Main loop: load the 128 bytes below %rsi, then store them below %rdi
   with eight non-temporal stores.  All loads precede all stores.  */
2940L(large_page_bwd_loop):
2941 movdqu -0x10(%rsi), %xmm0
2942 movdqu -0x20(%rsi), %xmm1
2943 movdqu -0x30(%rsi), %xmm2
2944 movdqu -0x40(%rsi), %xmm3
2945 movdqu -0x50(%rsi), %xmm4
2946 movdqu -0x60(%rsi), %xmm5
2947 movdqu -0x70(%rsi), %xmm6
2948 movdqu -0x80(%rsi), %xmm7
2949 lea -0x80(%rsi), %rsi
2950
/* Flags from this sub feed the `jae` below; movntdq and lea do not
   modify flags.  */
2951 sub $0x80, %rdx
2952 movntdq %xmm0, -0x10(%rdi)
2953 movntdq %xmm1, -0x20(%rdi)
2954 movntdq %xmm2, -0x30(%rdi)
2955 movntdq %xmm3, -0x40(%rdi)
2956 movntdq %xmm4, -0x50(%rdi)
2957 movntdq %xmm5, -0x60(%rdi)
2958 movntdq %xmm6, -0x70(%rdi)
2959 movntdq %xmm7, -0x80(%rdi)
2960 lea -0x80(%rdi), %rdi
2961 jae L(large_page_bwd_loop)
/* Loop exit: %rdx is in [-0x80, -1].  Compare first (signed), then undo
   the bias with lea, which preserves the flags for the jl: skip the
   64-byte block when fewer than 0x40 bytes remain.  */
2962 cmp $-0x40, %rdx
2963 lea 0x80(%rdx), %rdx
2964 jl L(large_page_less_bwd_64bytes)
2965
/* One more 64-byte block.  */
2966 movdqu -0x10(%rsi), %xmm0
2967 movdqu -0x20(%rsi), %xmm1
2968 movdqu -0x30(%rsi), %xmm2
2969 movdqu -0x40(%rsi), %xmm3
2970 lea -0x40(%rsi), %rsi
2971
2972 movntdq %xmm0, -0x10(%rdi)
2973 movntdq %xmm1, -0x20(%rdi)
2974 movntdq %xmm2, -0x30(%rdi)
2975 movntdq %xmm3, -0x40(%rdi)
2976 lea -0x40(%rdi), %rdi
2977 sub $0x40, %rdx
2978L(large_page_less_bwd_64bytes):
/* Order the weakly-ordered non-temporal stores before later stores.
   No cursor adjustment is made here (unlike the forward path): the
   write_N routines address [-N, 0) relative to %rsi/%rdi.  */
2979 sfence
2980 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
2981
2982#ifdef USE_AS_MEMMOVE
2983 .p2align 4
/* memmove-only backward loop, entered from L(large_page_bwd).  Same
   structure as L(large_page_bwd_loop) but with ordinary aligned stores
   (movaps) and software prefetch instead of non-temporal stores.
   %rdx carries the same -0x80 bias.  movaps faults on an unaligned
   address, so %rdi must be 16-byte aligned.  */
2984L(ll_cache_copy_bwd_start):
/* Prefetch source data 0x1c0 and 0x200 bytes below the cursor.  */
2985 prefetcht0 -0x1c0(%rsi)
2986 prefetcht0 -0x200(%rsi)
2987 movdqu -0x10(%rsi), %xmm0
2988 movdqu -0x20(%rsi), %xmm1
2989 movdqu -0x30(%rsi), %xmm2
2990 movdqu -0x40(%rsi), %xmm3
2991 movdqu -0x50(%rsi), %xmm4
2992 movdqu -0x60(%rsi), %xmm5
2993 movdqu -0x70(%rsi), %xmm6
2994 movdqu -0x80(%rsi), %xmm7
2995 lea -0x80(%rsi), %rsi
2996
/* Flags from this sub feed the `jae` below; movaps and lea do not
   modify flags.  */
2997 sub $0x80, %rdx
2998 movaps %xmm0, -0x10(%rdi)
2999 movaps %xmm1, -0x20(%rdi)
3000 movaps %xmm2, -0x30(%rdi)
3001 movaps %xmm3, -0x40(%rdi)
3002 movaps %xmm4, -0x50(%rdi)
3003 movaps %xmm5, -0x60(%rdi)
3004 movaps %xmm6, -0x70(%rdi)
3005 movaps %xmm7, -0x80(%rdi)
3006 lea -0x80(%rdi), %rdi
3007 jae L(ll_cache_copy_bwd_start)
/* Loop exit: compare the biased count (signed), then remove the bias
   with lea so the flags survive for the jl.  Skip the 64-byte block
   when fewer than 0x40 bytes remain.  */
3008 cmp $-0x40, %rdx
3009 lea 0x80(%rdx), %rdx
3010 jl L(large_page_ll_less_bwd_64bytes)
3011
/* One more 64-byte block.  */
3012 movdqu -0x10(%rsi), %xmm0
3013 movdqu -0x20(%rsi), %xmm1
3014 movdqu -0x30(%rsi), %xmm2
3015 movdqu -0x40(%rsi), %xmm3
3016 lea -0x40(%rsi), %rsi
3017
3018 movaps %xmm0, -0x10(%rdi)
3019 movaps %xmm1, -0x20(%rdi)
3020 movaps %xmm2, -0x30(%rdi)
3021 movaps %xmm3, -0x40(%rdi)
3022 lea -0x40(%rdi), %rdi
3023 sub $0x40, %rdx
3024L(large_page_ll_less_bwd_64bytes):
/* Dispatch on the remaining byte count; the cursors are left as they
   are because the write_N routines address [-N, 0) relative to them.  */
3025 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
3026#endif
3027
3028END (MEMCPY)
3029
3030 .section .rodata.ssse3,"a",@progbits
3031 .p2align 3
/* Jump table for the final 0..79 bytes of a copy.  Entry N is the
   32-bit offset of L(write_Nbytes) relative to the start of this table
   (JMPTBL (I, B) expands to I - B).  BRANCH_TO_JMPTBL_ENTRY indexes it
   with the remaining byte count scaled by 4, so the entries must stay
   in ascending order with no gaps.  */
3032L(table_less_80bytes):
3033 .int JMPTBL (L(write_0bytes), L(table_less_80bytes))
3034 .int JMPTBL (L(write_1bytes), L(table_less_80bytes))
3035 .int JMPTBL (L(write_2bytes), L(table_less_80bytes))
3036 .int JMPTBL (L(write_3bytes), L(table_less_80bytes))
3037 .int JMPTBL (L(write_4bytes), L(table_less_80bytes))
3038 .int JMPTBL (L(write_5bytes), L(table_less_80bytes))
3039 .int JMPTBL (L(write_6bytes), L(table_less_80bytes))
3040 .int JMPTBL (L(write_7bytes), L(table_less_80bytes))
3041 .int JMPTBL (L(write_8bytes), L(table_less_80bytes))
3042 .int JMPTBL (L(write_9bytes), L(table_less_80bytes))
3043 .int JMPTBL (L(write_10bytes), L(table_less_80bytes))
3044 .int JMPTBL (L(write_11bytes), L(table_less_80bytes))
3045 .int JMPTBL (L(write_12bytes), L(table_less_80bytes))
3046 .int JMPTBL (L(write_13bytes), L(table_less_80bytes))
3047 .int JMPTBL (L(write_14bytes), L(table_less_80bytes))
3048 .int JMPTBL (L(write_15bytes), L(table_less_80bytes))
3049 .int JMPTBL (L(write_16bytes), L(table_less_80bytes))
3050 .int JMPTBL (L(write_17bytes), L(table_less_80bytes))
3051 .int JMPTBL (L(write_18bytes), L(table_less_80bytes))
3052 .int JMPTBL (L(write_19bytes), L(table_less_80bytes))
3053 .int JMPTBL (L(write_20bytes), L(table_less_80bytes))
3054 .int JMPTBL (L(write_21bytes), L(table_less_80bytes))
3055 .int JMPTBL (L(write_22bytes), L(table_less_80bytes))
3056 .int JMPTBL (L(write_23bytes), L(table_less_80bytes))
3057 .int JMPTBL (L(write_24bytes), L(table_less_80bytes))
3058 .int JMPTBL (L(write_25bytes), L(table_less_80bytes))
3059 .int JMPTBL (L(write_26bytes), L(table_less_80bytes))
3060 .int JMPTBL (L(write_27bytes), L(table_less_80bytes))
3061 .int JMPTBL (L(write_28bytes), L(table_less_80bytes))
3062 .int JMPTBL (L(write_29bytes), L(table_less_80bytes))
3063 .int JMPTBL (L(write_30bytes), L(table_less_80bytes))
3064 .int JMPTBL (L(write_31bytes), L(table_less_80bytes))
3065 .int JMPTBL (L(write_32bytes), L(table_less_80bytes))
3066 .int JMPTBL (L(write_33bytes), L(table_less_80bytes))
3067 .int JMPTBL (L(write_34bytes), L(table_less_80bytes))
3068 .int JMPTBL (L(write_35bytes), L(table_less_80bytes))
3069 .int JMPTBL (L(write_36bytes), L(table_less_80bytes))
3070 .int JMPTBL (L(write_37bytes), L(table_less_80bytes))
3071 .int JMPTBL (L(write_38bytes), L(table_less_80bytes))
3072 .int JMPTBL (L(write_39bytes), L(table_less_80bytes))
3073 .int JMPTBL (L(write_40bytes), L(table_less_80bytes))
3074 .int JMPTBL (L(write_41bytes), L(table_less_80bytes))
3075 .int JMPTBL (L(write_42bytes), L(table_less_80bytes))
3076 .int JMPTBL (L(write_43bytes), L(table_less_80bytes))
3077 .int JMPTBL (L(write_44bytes), L(table_less_80bytes))
3078 .int JMPTBL (L(write_45bytes), L(table_less_80bytes))
3079 .int JMPTBL (L(write_46bytes), L(table_less_80bytes))
3080 .int JMPTBL (L(write_47bytes), L(table_less_80bytes))
3081 .int JMPTBL (L(write_48bytes), L(table_less_80bytes))
3082 .int JMPTBL (L(write_49bytes), L(table_less_80bytes))
3083 .int JMPTBL (L(write_50bytes), L(table_less_80bytes))
3084 .int JMPTBL (L(write_51bytes), L(table_less_80bytes))
3085 .int JMPTBL (L(write_52bytes), L(table_less_80bytes))
3086 .int JMPTBL (L(write_53bytes), L(table_less_80bytes))
3087 .int JMPTBL (L(write_54bytes), L(table_less_80bytes))
3088 .int JMPTBL (L(write_55bytes), L(table_less_80bytes))
3089 .int JMPTBL (L(write_56bytes), L(table_less_80bytes))
3090 .int JMPTBL (L(write_57bytes), L(table_less_80bytes))
3091 .int JMPTBL (L(write_58bytes), L(table_less_80bytes))
3092 .int JMPTBL (L(write_59bytes), L(table_less_80bytes))
3093 .int JMPTBL (L(write_60bytes), L(table_less_80bytes))
3094 .int JMPTBL (L(write_61bytes), L(table_less_80bytes))
3095 .int JMPTBL (L(write_62bytes), L(table_less_80bytes))
3096 .int JMPTBL (L(write_63bytes), L(table_less_80bytes))
3097 .int JMPTBL (L(write_64bytes), L(table_less_80bytes))
3098 .int JMPTBL (L(write_65bytes), L(table_less_80bytes))
3099 .int JMPTBL (L(write_66bytes), L(table_less_80bytes))
3100 .int JMPTBL (L(write_67bytes), L(table_less_80bytes))
3101 .int JMPTBL (L(write_68bytes), L(table_less_80bytes))
3102 .int JMPTBL (L(write_69bytes), L(table_less_80bytes))
3103 .int JMPTBL (L(write_70bytes), L(table_less_80bytes))
3104 .int JMPTBL (L(write_71bytes), L(table_less_80bytes))
3105 .int JMPTBL (L(write_72bytes), L(table_less_80bytes))
3106 .int JMPTBL (L(write_73bytes), L(table_less_80bytes))
3107 .int JMPTBL (L(write_74bytes), L(table_less_80bytes))
3108 .int JMPTBL (L(write_75bytes), L(table_less_80bytes))
3109 .int JMPTBL (L(write_76bytes), L(table_less_80bytes))
3110 .int JMPTBL (L(write_77bytes), L(table_less_80bytes))
3111 .int JMPTBL (L(write_78bytes), L(table_less_80bytes))
3112 .int JMPTBL (L(write_79bytes), L(table_less_80bytes))
3113
3114 .p2align 3
/* Jump table of 16 entries; entry N is the 32-bit offset of L(shl_N)
   relative to the start of this table (JMPTBL (I, B) expands to I - B).
   The code that indexes it is outside this section -- presumably the
   index is the source misalignment (0..15) in the forward copy path;
   verify against the dispatch site.  */
3115L(shl_table):
3116 .int JMPTBL (L(shl_0), L(shl_table))
3117 .int JMPTBL (L(shl_1), L(shl_table))
3118 .int JMPTBL (L(shl_2), L(shl_table))
3119 .int JMPTBL (L(shl_3), L(shl_table))
3120 .int JMPTBL (L(shl_4), L(shl_table))
3121 .int JMPTBL (L(shl_5), L(shl_table))
3122 .int JMPTBL (L(shl_6), L(shl_table))
3123 .int JMPTBL (L(shl_7), L(shl_table))
3124 .int JMPTBL (L(shl_8), L(shl_table))
3125 .int JMPTBL (L(shl_9), L(shl_table))
3126 .int JMPTBL (L(shl_10), L(shl_table))
3127 .int JMPTBL (L(shl_11), L(shl_table))
3128 .int JMPTBL (L(shl_12), L(shl_table))
3129 .int JMPTBL (L(shl_13), L(shl_table))
3130 .int JMPTBL (L(shl_14), L(shl_table))
3131 .int JMPTBL (L(shl_15), L(shl_table))
3132
3133 .p2align 3
/* Jump table of 16 entries; entry N is the 32-bit offset of
   L(shl_N_bwd) relative to the start of this table (JMPTBL (I, B)
   expands to I - B).  The code that indexes it is outside this section
   -- presumably the backward-copy counterpart of L(shl_table), indexed
   by source misalignment (0..15); verify against the dispatch site.  */
3134L(shl_table_bwd):
3135 .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
3136 .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
3137 .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
3138 .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
3139 .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
3140 .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
3141 .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
3142 .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
3143 .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
3144 .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
3145 .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
3146 .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
3147 .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
3148 .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
3149 .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
3150 .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
3151
3152#endif
3153