1/* memcpy with SSSE3
2 Copyright (C) 2010-2017 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20#include <sysdep.h>
21
22#if IS_IN (libc) \
23 && (defined SHARED \
24 || defined USE_AS_MEMMOVE \
25 || !defined USE_MULTIARCH)
26
27#include "asm-syntax.h"
28
29#ifndef MEMCPY
30# define MEMCPY __memcpy_ssse3
31# define MEMCPY_CHK __memcpy_chk_ssse3
32# define MEMPCPY __mempcpy_ssse3
33# define MEMPCPY_CHK __mempcpy_chk_ssse3
34#endif
35
/* Difference between label I and base label B.
   NOTE(review): presumably used to build the relative jump tables
   read by BRANCH_TO_JMPTBL_ENTRY -- the tables are not in view.  */
#define JMPTBL(I, B)	I - B

/* Branch to an entry in a jump table.  TABLE is a jump table with
   relative offsets.  INDEX is a register that contains the index into
   the jump table; it is overwritten with the branch target.  SCALE is
   the scale of INDEX.  %r11 is clobbered (it receives the address of
   TABLE).  The ud2 sits directly after the unconditional indirect
   jump, so it is never reached architecturally.  */
#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
  leaq		TABLE(%rip), %r11;				\
  movslq	(%r11, INDEX, SCALE), INDEX;			\
  leaq		(%r11, INDEX), INDEX;				\
  jmp		*INDEX;						\
  ud2
47
48 .section .text.ssse3,"ax",@progbits
49#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
/* Checked mempcpy entry.  Branches to __chk_fail when %rcx is below
   the length in %rdx (unsigned compare); otherwise execution runs
   straight on into the code placed after END.
   NOTE(review): %rcx is presumably the destination object size
   passed as the fourth argument -- confirm against the callers.  */
ENTRY (MEMPCPY_CHK)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMPCPY_CHK)
54
/* mempcpy entry: %rdi = dst, %rsi = src, %rdx = n.
   Returns dst + n in %rax and shares the copy code at L(start).  */
ENTRY (MEMPCPY)
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP	/* Return value = dst + n.  */
#ifdef __ILP32__
	/* Clear the upper 32 bits of the length.  L(start) is reached
	   here without passing through the equivalent clearing done at
	   the MEMCPY entry, and everything from L(start) on treats
	   %rdx as a full 64-bit byte count.  */
	mov	%edx, %edx
#endif
	jmp	L(start)
END (MEMPCPY)
60#endif
61
62#if !defined USE_AS_BCOPY
/* Checked memcpy entry.  Branches to __chk_fail when %rcx is below the
   length in %rdx (unsigned compare); otherwise execution continues
   with the code placed after END.
   NOTE(review): %rcx is presumably the destination object size
   passed as the fourth argument -- confirm against the callers.  */
ENTRY (MEMCPY_CHK)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
67#endif
68
/* Main entry: %rdi = dst, %rsi = src, %rdx = n.
   Sets up the return value in %rax, picks the copy direction when
   built as memmove, and dispatches lengths below 80 bytes through
   L(table_less_80bytes).  Longer copies go to L(80bytesormore).  */
ENTRY (MEMCPY)
	mov	%RDI_LP, %RAX_LP	/* Return value = dst ...  */
#ifdef USE_AS_MEMPCPY
	add	%RDX_LP, %RAX_LP	/* ... or dst + n.  */
#endif

#ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#endif

#ifdef USE_AS_MEMMOVE
	/* dst below src: forward.  dst == src: L(write_0bytes).
	   dst above src: backward, unless n is below 80.  */
	cmpq	%rsi, %rdi
	jb	L(copy_forward)
	je	L(write_0bytes)
	cmpq	$79, %rdx
	jbe	L(copy_forward)
	jmp	L(copy_backward)
L(copy_forward):
#endif
L(start):
	cmpq	$79, %rdx
	leaq	L(table_less_80bytes)(%rip), %r11	/* Flags untouched.  */
	ja	L(80bytesormore)
	/* n < 80: advance both pointers by n and jump to the table
	   entry selected by n.  */
	movslq	(%r11, %rdx, 4), %r9
	addq	%rdx, %rsi
	addq	%rdx, %rdi
	addq	%r11, %r9
	jmp	*%r9
	ud2
99
/* Forward copy of 80 bytes or more.  Saves the first 16 source bytes
   in %xmm0 and the original dst in %r8 (stored later by the selected
   loop), rounds %rdi up to the next 16-byte boundary and adjusts
   %rsi/%rdx to match, then dispatches on the size and on the low
   four bits of the adjusted source pointer.  */
	.p2align 4
L(80bytesormore):
#ifndef USE_AS_MEMMOVE
	/* Signed compare of the low address bytes only.
	   NOTE(review): presumably a performance heuristic for the
	   copy direction -- confirm the intent.  */
	cmpb	%dil, %sil
	jle	L(copy_backward)
#endif

	movdqu	(%rsi), %xmm0
	movq	%rdi, %rcx
	andq	$-0x10, %rdi
	addq	$0x10, %rdi		/* Next 16-byte boundary above dst.  */
	movq	%rcx, %r8		/* %r8 = original dst.  */
	subq	%rdi, %rcx		/* %rcx = dst - aligned dst (negative).  */
	addq	%rcx, %rdx		/* Drop the skipped bytes from n.  */
	subq	%rcx, %rsi		/* Advance src by the same amount.  */

#ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %RCX_LP
#else
	mov	__x86_shared_cache_size_half(%rip), %RCX_LP
#endif
	cmpq	%rcx, %rdx
	movq	%rsi, %r9		/* mov keeps the flags for the ja.  */
	ja	L(large_page_fwd)
	andq	$0xf, %r9		/* Source offset within 16 bytes.  */
	jz	L(shl_0)
	/* Leave the data cache threshold in %rcx for the target.  */
#ifdef DATA_CACHE_SIZE_HALF
	mov	$DATA_CACHE_SIZE_HALF, %RCX_LP
#else
	mov	__x86_data_cache_size_half(%rip), %RCX_LP
#endif
	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
132
/* Backward copy set-up.  Saves the last 16 source bytes in %xmm0 and
   their destination address (dst + n - 16) in %r8, moves both
   pointers to the end of their buffers, rounds the dst end down to a
   16-byte boundary and adjusts %rsi/%rdx by the same amount, then
   dispatches like the forward path.  */
	.p2align 4
L(copy_backward):
	movdqu	-0x10(%rsi, %rdx), %xmm0
	addq	%rdx, %rsi
	leaq	-0x10(%rdi, %rdx), %r8
	addq	%rdx, %rdi

	movq	%rdi, %rcx
	andq	$0xf, %rcx		/* Bytes above the 16-byte boundary.  */
	xorq	%rcx, %rdi		/* Clear those low bits of dst end.  */
	subq	%rcx, %rdx
	subq	%rcx, %rsi

#ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %RCX_LP
#else
	mov	__x86_shared_cache_size_half(%rip), %RCX_LP
#endif

	cmpq	%rcx, %rdx
	movq	%rsi, %r9		/* mov keeps the flags for the ja.  */
	ja	L(large_page_bwd)
	andq	$0xf, %r9		/* Source offset within 16 bytes.  */
	jz	L(shl_0_bwd)
	/* Leave the data cache threshold in %rcx for the target.  */
#ifdef DATA_CACHE_SIZE_HALF
	mov	$DATA_CACHE_SIZE_HALF, %RCX_LP
#else
	mov	__x86_data_cache_size_half(%rip), %RCX_LP
#endif
	BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
163
/* Forward copy with source and destination both 16-byte aligned
   (reached when %rsi & 15 is zero).  Copies one aligned 16-byte
   chunk, stores the saved head (%xmm0 to (%r8)), then either hands
   more than 128 bytes to L(shl_0_gobble) or copies one optional
   64-byte group and finishes through L(table_less_80bytes).  */
	.p2align 4
L(shl_0):
	subq	$0x10, %rdx
	movdqa	(%rsi), %xmm1
	addq	$0x10, %rsi
	movdqa	%xmm1, (%rdi)
	addq	$0x10, %rdi
	cmpq	$0x80, %rdx
	movdqu	%xmm0, (%r8)		/* Store leaves the flags alone.  */
	ja	L(shl_0_gobble)
	cmpq	$0x40, %rdx
	jb	L(shl_0_less_64bytes)
	movaps	(%rsi), %xmm4
	movaps	0x10(%rsi), %xmm1
	movaps	0x20(%rsi), %xmm2
	movaps	0x30(%rsi), %xmm3
	movaps	%xmm4, (%rdi)
	movaps	%xmm1, 0x10(%rdi)
	movaps	%xmm2, 0x20(%rdi)
	movaps	%xmm3, 0x30(%rdi)
	subq	$0x40, %rdx
	addq	$0x40, %rsi
	addq	$0x40, %rdi
L(shl_0_less_64bytes):
	/* Point both registers past the remaining bytes, then dispatch
	   on the remaining count.  */
	addq	%rdx, %rsi
	addq	%rdx, %rdi
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
191
/* Aligned forward copy of more than 128 bytes.  Lengths at or above
   the data cache threshold go to L(shl_0_gobble_mem_loop); the rest
   run the 128-byte-per-pass loop below, then one optional 64-byte
   group, and finish through L(table_less_80bytes).  */
	.p2align 4
L(shl_0_gobble):
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %RDX_LP
#else
	cmp	__x86_data_cache_size_half(%rip), %RDX_LP
#endif
	leaq	-0x80(%rdx), %rdx	/* Bias the count; flags kept.  */
	jae	L(shl_0_gobble_mem_loop)
L(shl_0_gobble_cache_loop):
	movdqa	(%rsi), %xmm4
	movaps	0x10(%rsi), %xmm1
	movaps	0x20(%rsi), %xmm2
	movaps	0x30(%rsi), %xmm3

	movdqa	%xmm4, (%rdi)
	movaps	%xmm1, 0x10(%rdi)
	movaps	%xmm2, 0x20(%rdi)
	movaps	%xmm3, 0x30(%rdi)

	subq	$0x80, %rdx		/* Flags consumed by the jae below.  */
	movaps	0x40(%rsi), %xmm4
	movaps	0x50(%rsi), %xmm5
	movaps	0x60(%rsi), %xmm6
	movaps	0x70(%rsi), %xmm7
	leaq	0x80(%rsi), %rsi
	movaps	%xmm4, 0x40(%rdi)
	movaps	%xmm5, 0x50(%rdi)
	movaps	%xmm6, 0x60(%rdi)
	movaps	%xmm7, 0x70(%rdi)
	leaq	0x80(%rdi), %rdi

	jae	L(shl_0_gobble_cache_loop)
	/* %rdx is negative here; compare before removing the bias.  */
	cmpq	$-0x40, %rdx
	leaq	0x80(%rdx), %rdx
	jl	L(shl_0_cache_less_64bytes)

	movdqa	(%rsi), %xmm4
	subq	$0x40, %rdx
	movdqa	0x10(%rsi), %xmm1

	movdqa	%xmm4, (%rdi)
	movdqa	%xmm1, 0x10(%rdi)

	movdqa	0x20(%rsi), %xmm4
	movdqa	0x30(%rsi), %xmm1
	addq	$0x40, %rsi

	movdqa	%xmm4, 0x20(%rdi)
	movdqa	%xmm1, 0x30(%rdi)
	addq	$0x40, %rdi
L(shl_0_cache_less_64bytes):
	addq	%rdx, %rsi
	addq	%rdx, %rdi
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
247
/* Aligned forward copy for lengths at or above the data cache
   threshold: 128 bytes per pass with two prefetcht0 hints ahead of
   the source, then optional 64- and 32-byte groups, then the tail
   through L(table_less_80bytes).  %rdx arrives biased by -128.  */
	.p2align 4
L(shl_0_gobble_mem_loop):
	prefetcht0 0x1c0(%rsi)
	prefetcht0 0x280(%rsi)

	movdqa	(%rsi), %xmm0
	movdqa	0x10(%rsi), %xmm1
	movdqa	0x20(%rsi), %xmm2
	movdqa	0x30(%rsi), %xmm3
	movdqa	0x40(%rsi), %xmm4
	movdqa	0x50(%rsi), %xmm5
	movdqa	0x60(%rsi), %xmm6
	movdqa	0x70(%rsi), %xmm7
	leaq	0x80(%rsi), %rsi
	subq	$0x80, %rdx		/* Flags consumed by the jae below.  */
	movdqa	%xmm0, (%rdi)
	movdqa	%xmm1, 0x10(%rdi)
	movdqa	%xmm2, 0x20(%rdi)
	movdqa	%xmm3, 0x30(%rdi)
	movdqa	%xmm4, 0x40(%rdi)
	movdqa	%xmm5, 0x50(%rdi)
	movdqa	%xmm6, 0x60(%rdi)
	movdqa	%xmm7, 0x70(%rdi)
	leaq	0x80(%rdi), %rdi

	jae	L(shl_0_gobble_mem_loop)
	/* %rdx is negative here; compare before removing the bias.  */
	cmpq	$-0x40, %rdx
	leaq	0x80(%rdx), %rdx
	jl	L(shl_0_mem_less_64bytes)

	movdqa	(%rsi), %xmm0
	subq	$0x40, %rdx
	movdqa	0x10(%rsi), %xmm1

	movdqa	%xmm0, (%rdi)
	movdqa	%xmm1, 0x10(%rdi)

	movdqa	0x20(%rsi), %xmm0
	movdqa	0x30(%rsi), %xmm1
	addq	$0x40, %rsi

	movdqa	%xmm0, 0x20(%rdi)
	movdqa	%xmm1, 0x30(%rdi)
	addq	$0x40, %rdi
L(shl_0_mem_less_64bytes):
	cmpq	$0x20, %rdx
	jb	L(shl_0_mem_less_32bytes)
	movdqa	(%rsi), %xmm0
	subq	$0x20, %rdx
	movdqa	0x10(%rsi), %xmm1
	addq	$0x20, %rsi
	movdqa	%xmm0, (%rdi)
	movdqa	%xmm1, 0x10(%rdi)
	addq	$0x20, %rdi
L(shl_0_mem_less_32bytes):
	addq	%rdx, %rdi
	addq	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
306
/* Backward copy with source and destination ends both 16-byte
   aligned (reached when %rsi & 15 is zero).  Copies one aligned
   16-byte chunk below the pointers, stores the saved tail (%xmm0 to
   (%r8)), then either hands more than 128 bytes to
   L(shl_0_gobble_bwd) or copies one optional 64-byte group and
   finishes through L(table_less_80bytes).  */
	.p2align 4
L(shl_0_bwd):
	subq	$0x10, %rdx
	movdqa	-0x10(%rsi), %xmm1
	subq	$0x10, %rsi
	movdqa	%xmm1, -0x10(%rdi)
	subq	$0x10, %rdi
	cmpq	$0x80, %rdx
	movdqu	%xmm0, (%r8)		/* Store leaves the flags alone.  */
	ja	L(shl_0_gobble_bwd)
	cmpq	$0x40, %rdx
	jb	L(shl_0_less_64bytes_bwd)
	movaps	-0x10(%rsi), %xmm0
	movaps	-0x20(%rsi), %xmm1
	movaps	-0x30(%rsi), %xmm2
	movaps	-0x40(%rsi), %xmm3
	movaps	%xmm0, -0x10(%rdi)
	movaps	%xmm1, -0x20(%rdi)
	movaps	%xmm2, -0x30(%rdi)
	movaps	%xmm3, -0x40(%rdi)
	subq	$0x40, %rdx
	subq	$0x40, %rsi
	subq	$0x40, %rdi
L(shl_0_less_64bytes_bwd):
	/* The pointers already sit just above the remaining bytes, so
	   no adjustment is made before the dispatch.  */
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
332
/* Aligned backward copy of more than 128 bytes.  Lengths at or above
   the data cache threshold go to L(shl_0_gobble_mem_bwd_loop); the
   rest run the 128-byte-per-pass loop below, then one optional
   64-byte group, and finish through L(table_less_80bytes).  */
	.p2align 4
L(shl_0_gobble_bwd):
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %RDX_LP
#else
	cmp	__x86_data_cache_size_half(%rip), %RDX_LP
#endif
	leaq	-0x80(%rdx), %rdx	/* Bias the count; flags kept.  */
	jae	L(shl_0_gobble_mem_bwd_loop)
L(shl_0_gobble_bwd_loop):
	movdqa	-0x10(%rsi), %xmm0
	movaps	-0x20(%rsi), %xmm1
	movaps	-0x30(%rsi), %xmm2
	movaps	-0x40(%rsi), %xmm3

	movdqa	%xmm0, -0x10(%rdi)
	movaps	%xmm1, -0x20(%rdi)
	movaps	%xmm2, -0x30(%rdi)
	movaps	%xmm3, -0x40(%rdi)

	subq	$0x80, %rdx		/* Flags consumed by the jae below.  */
	movaps	-0x50(%rsi), %xmm4
	movaps	-0x60(%rsi), %xmm5
	movaps	-0x70(%rsi), %xmm6
	movaps	-0x80(%rsi), %xmm7
	leaq	-0x80(%rsi), %rsi
	movaps	%xmm4, -0x50(%rdi)
	movaps	%xmm5, -0x60(%rdi)
	movaps	%xmm6, -0x70(%rdi)
	movaps	%xmm7, -0x80(%rdi)
	leaq	-0x80(%rdi), %rdi

	jae	L(shl_0_gobble_bwd_loop)
	/* %rdx is negative here; compare before removing the bias.  */
	cmpq	$-0x40, %rdx
	leaq	0x80(%rdx), %rdx
	jl	L(shl_0_gobble_bwd_less_64bytes)

	movdqa	-0x10(%rsi), %xmm0
	subq	$0x40, %rdx
	movdqa	-0x20(%rsi), %xmm1

	movdqa	%xmm0, -0x10(%rdi)
	movdqa	%xmm1, -0x20(%rdi)

	movdqa	-0x30(%rsi), %xmm0
	movdqa	-0x40(%rsi), %xmm1
	subq	$0x40, %rsi

	movdqa	%xmm0, -0x30(%rdi)
	movdqa	%xmm1, -0x40(%rdi)
	subq	$0x40, %rdi
L(shl_0_gobble_bwd_less_64bytes):
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
386
/* Aligned backward copy for lengths at or above the data cache
   threshold: 128 bytes per pass with two prefetcht0 hints below the
   source, then optional 64- and 32-byte groups, then the tail through
   L(table_less_80bytes).  %rdx arrives biased by -128.  */
	.p2align 4
L(shl_0_gobble_mem_bwd_loop):
	prefetcht0 -0x1c0(%rsi)
	prefetcht0 -0x280(%rsi)
	movdqa	-0x10(%rsi), %xmm0
	movdqa	-0x20(%rsi), %xmm1
	movdqa	-0x30(%rsi), %xmm2
	movdqa	-0x40(%rsi), %xmm3
	movdqa	-0x50(%rsi), %xmm4
	movdqa	-0x60(%rsi), %xmm5
	movdqa	-0x70(%rsi), %xmm6
	movdqa	-0x80(%rsi), %xmm7
	leaq	-0x80(%rsi), %rsi
	subq	$0x80, %rdx		/* Flags consumed by the jae below.  */
	movdqa	%xmm0, -0x10(%rdi)
	movdqa	%xmm1, -0x20(%rdi)
	movdqa	%xmm2, -0x30(%rdi)
	movdqa	%xmm3, -0x40(%rdi)
	movdqa	%xmm4, -0x50(%rdi)
	movdqa	%xmm5, -0x60(%rdi)
	movdqa	%xmm6, -0x70(%rdi)
	movdqa	%xmm7, -0x80(%rdi)
	leaq	-0x80(%rdi), %rdi

	jae	L(shl_0_gobble_mem_bwd_loop)
	/* %rdx is negative here; compare before removing the bias.  */
	cmpq	$-0x40, %rdx
	leaq	0x80(%rdx), %rdx
	jl	L(shl_0_mem_bwd_less_64bytes)

	movdqa	-0x10(%rsi), %xmm0
	subq	$0x40, %rdx
	movdqa	-0x20(%rsi), %xmm1

	movdqa	%xmm0, -0x10(%rdi)
	movdqa	%xmm1, -0x20(%rdi)

	movdqa	-0x30(%rsi), %xmm0
	movdqa	-0x40(%rsi), %xmm1
	subq	$0x40, %rsi

	movdqa	%xmm0, -0x30(%rdi)
	movdqa	%xmm1, -0x40(%rdi)
	subq	$0x40, %rdi
L(shl_0_mem_bwd_less_64bytes):
	cmpq	$0x20, %rdx
	jb	L(shl_0_mem_bwd_less_32bytes)
	movdqa	-0x10(%rsi), %xmm0
	subq	$0x20, %rdx
	movdqa	-0x20(%rsi), %xmm1
	subq	$0x20, %rsi
	movdqa	%xmm0, -0x10(%rdi)
	movdqa	%xmm1, -0x20(%rdi)
	subq	$0x20, %rdi
L(shl_0_mem_bwd_less_32bytes):
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
442
/* Forward copy for a source 1 byte past a 16-byte boundary (assumed
   from the L(shl_table) dispatch, which is not in view).  Expects
   %r9 = address of L(shl_1), %rcx = cache-size threshold, and the
   pending head store in %xmm0/%r8.  Each pass does four aligned
   loads, rebuilds 64 destination bytes with palignr $1 and writes
   them with aligned stores.  */
	.p2align 4
L(shl_1):
	leaq	(L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
	cmpq	%rcx, %rdx
	movaps	-0x01(%rsi), %xmm1
	jb	L(L1_fwd)
	/* At or above the threshold: loop through the prefetch.  */
	leaq	(L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
L(L1_fwd):
	leaq	-0x40(%rdx), %rdx	/* Bias the count by one pass.  */
	jmp	*%r9
	ud2
L(shl_1_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_1_loop_L1):
	subq	$0x40, %rdx		/* CF consumed by the jb below.  */
	movaps	0x0f(%rsi), %xmm2
	movaps	0x1f(%rsi), %xmm3
	movaps	0x2f(%rsi), %xmm4
	movaps	0x3f(%rsi), %xmm5
	movdqa	%xmm5, %xmm6		/* Raw last chunk for the next pass.  */
	palignr	$1, %xmm4, %xmm5
	leaq	0x40(%rsi), %rsi
	palignr	$1, %xmm3, %xmm4
	palignr	$1, %xmm2, %xmm3
	leaq	0x40(%rdi), %rdi
	palignr	$1, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1
	movdqa	%xmm2, -0x40(%rdi)
	movaps	%xmm3, -0x30(%rdi)
	jb	L(shl_1_end)
	movaps	%xmm4, -0x20(%rdi)
	movaps	%xmm5, -0x10(%rdi)
	jmp	*%r9
	ud2
L(shl_1_end):
	movaps	%xmm4, -0x20(%rdi)
	leaq	0x40(%rdx), %rdx	/* Remove the bias: bytes left.  */
	movaps	%xmm5, -0x10(%rdi)
	addq	%rdx, %rdi
	movdqu	%xmm0, (%r8)		/* Store the saved head bytes.  */
	addq	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
485
/* Backward copy for a source end 1 byte past a 16-byte boundary
   (assumed from the L(shl_table_bwd) dispatch, which is not in view).
   Expects %r9 = address of L(shl_1_bwd), %rcx = cache-size
   threshold, and the pending tail store in %xmm0/%r8.  Each pass
   moves 64 bytes downward using aligned loads, palignr $1 and
   aligned stores.  */
	.p2align 4
L(shl_1_bwd):
	leaq	(L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
	cmpq	%rcx, %rdx
	movaps	-0x01(%rsi), %xmm1
	jb	L(L1_bwd)
	/* At or above the threshold: loop through the prefetch.  */
	leaq	(L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
L(L1_bwd):
	leaq	-0x40(%rdx), %rdx	/* Bias the count by one pass.  */
	jmp	*%r9
	ud2
L(shl_1_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_1_bwd_loop_L1):
	movaps	-0x11(%rsi), %xmm2
	subq	$0x40, %rdx		/* CF consumed by the jb below.  */
	movaps	-0x21(%rsi), %xmm3
	movaps	-0x31(%rsi), %xmm4
	movaps	-0x41(%rsi), %xmm5
	leaq	-0x40(%rsi), %rsi
	palignr	$1, %xmm2, %xmm1
	palignr	$1, %xmm3, %xmm2
	palignr	$1, %xmm4, %xmm3
	palignr	$1, %xmm5, %xmm4

	movaps	%xmm1, -0x10(%rdi)
	movaps	%xmm5, %xmm1		/* Lowest chunk feeds the next pass.  */

	movaps	%xmm2, -0x20(%rdi)
	leaq	-0x40(%rdi), %rdi

	movaps	%xmm3, 0x10(%rdi)
	jb	L(shl_1_bwd_end)
	movaps	%xmm4, (%rdi)
	jmp	*%r9
	ud2
L(shl_1_bwd_end):
	movaps	%xmm4, (%rdi)
	leaq	0x40(%rdx), %rdx	/* Remove the bias: bytes left.  */
	movdqu	%xmm0, (%r8)		/* Store the saved tail bytes.  */
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
527
/* Forward copy for a source 2 bytes past a 16-byte boundary (assumed
   from the L(shl_table) dispatch, which is not in view).  Expects
   %r9 = address of L(shl_2), %rcx = cache-size threshold, and the
   pending head store in %xmm0/%r8.  Each pass does four aligned
   loads, rebuilds 64 destination bytes with palignr $2 and writes
   them with aligned stores.  */
	.p2align 4
L(shl_2):
	leaq	(L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
	cmpq	%rcx, %rdx
	movaps	-0x02(%rsi), %xmm1
	jb	L(L2_fwd)
	/* At or above the threshold: loop through the prefetch.  */
	leaq	(L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
L(L2_fwd):
	leaq	-0x40(%rdx), %rdx	/* Bias the count by one pass.  */
	jmp	*%r9
	ud2
L(shl_2_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_2_loop_L1):
	subq	$0x40, %rdx		/* CF consumed by the jb below.  */
	movaps	0x0e(%rsi), %xmm2
	movaps	0x1e(%rsi), %xmm3
	movaps	0x2e(%rsi), %xmm4
	movaps	0x3e(%rsi), %xmm5
	movdqa	%xmm5, %xmm6		/* Raw last chunk for the next pass.  */
	palignr	$2, %xmm4, %xmm5
	leaq	0x40(%rsi), %rsi
	palignr	$2, %xmm3, %xmm4
	palignr	$2, %xmm2, %xmm3
	leaq	0x40(%rdi), %rdi
	palignr	$2, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1
	movdqa	%xmm2, -0x40(%rdi)
	movaps	%xmm3, -0x30(%rdi)
	jb	L(shl_2_end)
	movaps	%xmm4, -0x20(%rdi)
	movaps	%xmm5, -0x10(%rdi)
	jmp	*%r9
	ud2
L(shl_2_end):
	movaps	%xmm4, -0x20(%rdi)
	leaq	0x40(%rdx), %rdx	/* Remove the bias: bytes left.  */
	movaps	%xmm5, -0x10(%rdi)
	addq	%rdx, %rdi
	movdqu	%xmm0, (%r8)		/* Store the saved head bytes.  */
	addq	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
570
/* Backward copy for a source end 2 bytes past a 16-byte boundary
   (assumed from the L(shl_table_bwd) dispatch, which is not in view).
   Expects %r9 = address of L(shl_2_bwd), %rcx = cache-size
   threshold, and the pending tail store in %xmm0/%r8.  Each pass
   moves 64 bytes downward using aligned loads, palignr $2 and
   aligned stores.  */
	.p2align 4
L(shl_2_bwd):
	leaq	(L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
	cmpq	%rcx, %rdx
	movaps	-0x02(%rsi), %xmm1
	jb	L(L2_bwd)
	/* At or above the threshold: loop through the prefetch.  */
	leaq	(L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
L(L2_bwd):
	leaq	-0x40(%rdx), %rdx	/* Bias the count by one pass.  */
	jmp	*%r9
	ud2
L(shl_2_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_2_bwd_loop_L1):
	movaps	-0x12(%rsi), %xmm2
	subq	$0x40, %rdx		/* CF consumed by the jb below.  */
	movaps	-0x22(%rsi), %xmm3
	movaps	-0x32(%rsi), %xmm4
	movaps	-0x42(%rsi), %xmm5
	leaq	-0x40(%rsi), %rsi
	palignr	$2, %xmm2, %xmm1
	palignr	$2, %xmm3, %xmm2
	palignr	$2, %xmm4, %xmm3
	palignr	$2, %xmm5, %xmm4

	movaps	%xmm1, -0x10(%rdi)
	movaps	%xmm5, %xmm1		/* Lowest chunk feeds the next pass.  */

	movaps	%xmm2, -0x20(%rdi)
	leaq	-0x40(%rdi), %rdi

	movaps	%xmm3, 0x10(%rdi)
	jb	L(shl_2_bwd_end)
	movaps	%xmm4, (%rdi)
	jmp	*%r9
	ud2
L(shl_2_bwd_end):
	movaps	%xmm4, (%rdi)
	leaq	0x40(%rdx), %rdx	/* Remove the bias: bytes left.  */
	movdqu	%xmm0, (%r8)		/* Store the saved tail bytes.  */
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
612
/* Forward copy for a source 3 bytes past a 16-byte boundary (assumed
   from the L(shl_table) dispatch, which is not in view).  Expects
   %r9 = address of L(shl_3), %rcx = cache-size threshold, and the
   pending head store in %xmm0/%r8.  Each pass does four aligned
   loads, rebuilds 64 destination bytes with palignr $3 and writes
   them with aligned stores.  */
	.p2align 4
L(shl_3):
	leaq	(L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
	cmpq	%rcx, %rdx
	movaps	-0x03(%rsi), %xmm1
	jb	L(L3_fwd)
	/* At or above the threshold: loop through the prefetch.  */
	leaq	(L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
L(L3_fwd):
	leaq	-0x40(%rdx), %rdx	/* Bias the count by one pass.  */
	jmp	*%r9
	ud2
L(shl_3_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_3_loop_L1):
	subq	$0x40, %rdx		/* CF consumed by the jb below.  */
	movaps	0x0d(%rsi), %xmm2
	movaps	0x1d(%rsi), %xmm3
	movaps	0x2d(%rsi), %xmm4
	movaps	0x3d(%rsi), %xmm5
	movdqa	%xmm5, %xmm6		/* Raw last chunk for the next pass.  */
	palignr	$3, %xmm4, %xmm5
	leaq	0x40(%rsi), %rsi
	palignr	$3, %xmm3, %xmm4
	palignr	$3, %xmm2, %xmm3
	leaq	0x40(%rdi), %rdi
	palignr	$3, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1
	movdqa	%xmm2, -0x40(%rdi)
	movaps	%xmm3, -0x30(%rdi)
	jb	L(shl_3_end)
	movaps	%xmm4, -0x20(%rdi)
	movaps	%xmm5, -0x10(%rdi)
	jmp	*%r9
	ud2
L(shl_3_end):
	movaps	%xmm4, -0x20(%rdi)
	leaq	0x40(%rdx), %rdx	/* Remove the bias: bytes left.  */
	movaps	%xmm5, -0x10(%rdi)
	addq	%rdx, %rdi
	movdqu	%xmm0, (%r8)		/* Store the saved head bytes.  */
	addq	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
655
/* Backward copy for a source end 3 bytes past a 16-byte boundary
   (assumed from the L(shl_table_bwd) dispatch, which is not in view).
   Expects %r9 = address of L(shl_3_bwd), %rcx = cache-size
   threshold, and the pending tail store in %xmm0/%r8.  Each pass
   moves 64 bytes downward using aligned loads, palignr $3 and
   aligned stores.  */
	.p2align 4
L(shl_3_bwd):
	leaq	(L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
	cmpq	%rcx, %rdx
	movaps	-0x03(%rsi), %xmm1
	jb	L(L3_bwd)
	/* At or above the threshold: loop through the prefetch.  */
	leaq	(L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
L(L3_bwd):
	leaq	-0x40(%rdx), %rdx	/* Bias the count by one pass.  */
	jmp	*%r9
	ud2
L(shl_3_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_3_bwd_loop_L1):
	movaps	-0x13(%rsi), %xmm2
	subq	$0x40, %rdx		/* CF consumed by the jb below.  */
	movaps	-0x23(%rsi), %xmm3
	movaps	-0x33(%rsi), %xmm4
	movaps	-0x43(%rsi), %xmm5
	leaq	-0x40(%rsi), %rsi
	palignr	$3, %xmm2, %xmm1
	palignr	$3, %xmm3, %xmm2
	palignr	$3, %xmm4, %xmm3
	palignr	$3, %xmm5, %xmm4

	movaps	%xmm1, -0x10(%rdi)
	movaps	%xmm5, %xmm1		/* Lowest chunk feeds the next pass.  */

	movaps	%xmm2, -0x20(%rdi)
	leaq	-0x40(%rdi), %rdi

	movaps	%xmm3, 0x10(%rdi)
	jb	L(shl_3_bwd_end)
	movaps	%xmm4, (%rdi)
	jmp	*%r9
	ud2
L(shl_3_bwd_end):
	movaps	%xmm4, (%rdi)
	leaq	0x40(%rdx), %rdx	/* Remove the bias: bytes left.  */
	movdqu	%xmm0, (%r8)		/* Store the saved tail bytes.  */
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
697
/* Forward copy for a source 4 bytes past a 16-byte boundary (assumed
   from the L(shl_table) dispatch, which is not in view).  Expects
   %r9 = address of L(shl_4), %rcx = cache-size threshold, and the
   pending head store in %xmm0/%r8.  Each pass does four aligned
   loads, rebuilds 64 destination bytes with palignr $4 and writes
   them with aligned stores.  */
	.p2align 4
L(shl_4):
	leaq	(L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
	cmpq	%rcx, %rdx
	movaps	-0x04(%rsi), %xmm1
	jb	L(L4_fwd)
	/* At or above the threshold: loop through the prefetch.  */
	leaq	(L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
L(L4_fwd):
	leaq	-0x40(%rdx), %rdx	/* Bias the count by one pass.  */
	jmp	*%r9
	ud2
L(shl_4_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_4_loop_L1):
	subq	$0x40, %rdx		/* CF consumed by the jb below.  */
	movaps	0x0c(%rsi), %xmm2
	movaps	0x1c(%rsi), %xmm3
	movaps	0x2c(%rsi), %xmm4
	movaps	0x3c(%rsi), %xmm5
	movdqa	%xmm5, %xmm6		/* Raw last chunk for the next pass.  */
	palignr	$4, %xmm4, %xmm5
	leaq	0x40(%rsi), %rsi
	palignr	$4, %xmm3, %xmm4
	palignr	$4, %xmm2, %xmm3
	leaq	0x40(%rdi), %rdi
	palignr	$4, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1
	movdqa	%xmm2, -0x40(%rdi)
	movaps	%xmm3, -0x30(%rdi)
	jb	L(shl_4_end)
	movaps	%xmm4, -0x20(%rdi)
	movaps	%xmm5, -0x10(%rdi)
	jmp	*%r9
	ud2
L(shl_4_end):
	movaps	%xmm4, -0x20(%rdi)
	leaq	0x40(%rdx), %rdx	/* Remove the bias: bytes left.  */
	movaps	%xmm5, -0x10(%rdi)
	addq	%rdx, %rdi
	movdqu	%xmm0, (%r8)		/* Store the saved head bytes.  */
	addq	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
740
/* Backward copy for a source end 4 bytes past a 16-byte boundary
   (assumed from the L(shl_table_bwd) dispatch, which is not in view).
   Expects %r9 = address of L(shl_4_bwd), %rcx = cache-size
   threshold, and the pending tail store in %xmm0/%r8.  Each pass
   moves 64 bytes downward using aligned loads, palignr $4 and
   aligned stores.  */
	.p2align 4
L(shl_4_bwd):
	leaq	(L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
	cmpq	%rcx, %rdx
	movaps	-0x04(%rsi), %xmm1
	jb	L(L4_bwd)
	/* At or above the threshold: loop through the prefetch.  */
	leaq	(L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
L(L4_bwd):
	leaq	-0x40(%rdx), %rdx	/* Bias the count by one pass.  */
	jmp	*%r9
	ud2
L(shl_4_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_4_bwd_loop_L1):
	movaps	-0x14(%rsi), %xmm2
	subq	$0x40, %rdx		/* CF consumed by the jb below.  */
	movaps	-0x24(%rsi), %xmm3
	movaps	-0x34(%rsi), %xmm4
	movaps	-0x44(%rsi), %xmm5
	leaq	-0x40(%rsi), %rsi
	palignr	$4, %xmm2, %xmm1
	palignr	$4, %xmm3, %xmm2
	palignr	$4, %xmm4, %xmm3
	palignr	$4, %xmm5, %xmm4

	movaps	%xmm1, -0x10(%rdi)
	movaps	%xmm5, %xmm1		/* Lowest chunk feeds the next pass.  */

	movaps	%xmm2, -0x20(%rdi)
	leaq	-0x40(%rdi), %rdi

	movaps	%xmm3, 0x10(%rdi)
	jb	L(shl_4_bwd_end)
	movaps	%xmm4, (%rdi)
	jmp	*%r9
	ud2
L(shl_4_bwd_end):
	movaps	%xmm4, (%rdi)
	leaq	0x40(%rdx), %rdx	/* Remove the bias: bytes left.  */
	movdqu	%xmm0, (%r8)		/* Store the saved tail bytes.  */
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
782
/* Forward copy for a source 5 bytes past a 16-byte boundary (assumed
   from the L(shl_table) dispatch, which is not in view).  Expects
   %r9 = address of L(shl_5), %rcx = cache-size threshold, and the
   pending head store in %xmm0/%r8.  Each pass does four aligned
   loads, rebuilds 64 destination bytes with palignr $5 and writes
   them with aligned stores.  */
	.p2align 4
L(shl_5):
	leaq	(L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
	cmpq	%rcx, %rdx
	movaps	-0x05(%rsi), %xmm1
	jb	L(L5_fwd)
	/* At or above the threshold: loop through the prefetch.  */
	leaq	(L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
L(L5_fwd):
	leaq	-0x40(%rdx), %rdx	/* Bias the count by one pass.  */
	jmp	*%r9
	ud2
L(shl_5_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_5_loop_L1):
	subq	$0x40, %rdx		/* CF consumed by the jb below.  */
	movaps	0x0b(%rsi), %xmm2
	movaps	0x1b(%rsi), %xmm3
	movaps	0x2b(%rsi), %xmm4
	movaps	0x3b(%rsi), %xmm5
	movdqa	%xmm5, %xmm6		/* Raw last chunk for the next pass.  */
	palignr	$5, %xmm4, %xmm5
	leaq	0x40(%rsi), %rsi
	palignr	$5, %xmm3, %xmm4
	palignr	$5, %xmm2, %xmm3
	leaq	0x40(%rdi), %rdi
	palignr	$5, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1
	movdqa	%xmm2, -0x40(%rdi)
	movaps	%xmm3, -0x30(%rdi)
	jb	L(shl_5_end)
	movaps	%xmm4, -0x20(%rdi)
	movaps	%xmm5, -0x10(%rdi)
	jmp	*%r9
	ud2
L(shl_5_end):
	movaps	%xmm4, -0x20(%rdi)
	leaq	0x40(%rdx), %rdx	/* Remove the bias: bytes left.  */
	movaps	%xmm5, -0x10(%rdi)
	addq	%rdx, %rdi
	movdqu	%xmm0, (%r8)		/* Store the saved head bytes.  */
	addq	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
825
/* Backward copy for a source end 5 bytes past a 16-byte boundary
   (assumed from the L(shl_table_bwd) dispatch, which is not in view).
   Expects %r9 = address of L(shl_5_bwd), %rcx = cache-size
   threshold, and the pending tail store in %xmm0/%r8.  Each pass
   moves 64 bytes downward using aligned loads, palignr $5 and
   aligned stores.  */
	.p2align 4
L(shl_5_bwd):
	leaq	(L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
	cmpq	%rcx, %rdx
	movaps	-0x05(%rsi), %xmm1
	jb	L(L5_bwd)
	/* At or above the threshold: loop through the prefetch.  */
	leaq	(L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
L(L5_bwd):
	leaq	-0x40(%rdx), %rdx	/* Bias the count by one pass.  */
	jmp	*%r9
	ud2
L(shl_5_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_5_bwd_loop_L1):
	movaps	-0x15(%rsi), %xmm2
	subq	$0x40, %rdx		/* CF consumed by the jb below.  */
	movaps	-0x25(%rsi), %xmm3
	movaps	-0x35(%rsi), %xmm4
	movaps	-0x45(%rsi), %xmm5
	leaq	-0x40(%rsi), %rsi
	palignr	$5, %xmm2, %xmm1
	palignr	$5, %xmm3, %xmm2
	palignr	$5, %xmm4, %xmm3
	palignr	$5, %xmm5, %xmm4

	movaps	%xmm1, -0x10(%rdi)
	movaps	%xmm5, %xmm1		/* Lowest chunk feeds the next pass.  */

	movaps	%xmm2, -0x20(%rdi)
	leaq	-0x40(%rdi), %rdi

	movaps	%xmm3, 0x10(%rdi)
	jb	L(shl_5_bwd_end)
	movaps	%xmm4, (%rdi)
	jmp	*%r9
	ud2
L(shl_5_bwd_end):
	movaps	%xmm4, (%rdi)
	leaq	0x40(%rdx), %rdx	/* Remove the bias: bytes left.  */
	movdqu	%xmm0, (%r8)		/* Store the saved tail bytes.  */
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
867
/* Forward copy for a source 6 bytes past a 16-byte boundary (assumed
   from the L(shl_table) dispatch, which is not in view).  Expects
   %r9 = address of L(shl_6), %rcx = cache-size threshold, and the
   pending head store in %xmm0/%r8.  Each pass does four aligned
   loads, rebuilds 64 destination bytes with palignr $6 and writes
   them with aligned stores.  */
	.p2align 4
L(shl_6):
	leaq	(L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
	cmpq	%rcx, %rdx
	movaps	-0x06(%rsi), %xmm1
	jb	L(L6_fwd)
	/* At or above the threshold: loop through the prefetch.  */
	leaq	(L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
L(L6_fwd):
	leaq	-0x40(%rdx), %rdx	/* Bias the count by one pass.  */
	jmp	*%r9
	ud2
L(shl_6_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_6_loop_L1):
	subq	$0x40, %rdx		/* CF consumed by the jb below.  */
	movaps	0x0a(%rsi), %xmm2
	movaps	0x1a(%rsi), %xmm3
	movaps	0x2a(%rsi), %xmm4
	movaps	0x3a(%rsi), %xmm5
	movdqa	%xmm5, %xmm6		/* Raw last chunk for the next pass.  */
	palignr	$6, %xmm4, %xmm5
	leaq	0x40(%rsi), %rsi
	palignr	$6, %xmm3, %xmm4
	palignr	$6, %xmm2, %xmm3
	leaq	0x40(%rdi), %rdi
	palignr	$6, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1
	movdqa	%xmm2, -0x40(%rdi)
	movaps	%xmm3, -0x30(%rdi)
	jb	L(shl_6_end)
	movaps	%xmm4, -0x20(%rdi)
	movaps	%xmm5, -0x10(%rdi)
	jmp	*%r9
	ud2
L(shl_6_end):
	movaps	%xmm4, -0x20(%rdi)
	leaq	0x40(%rdx), %rdx	/* Remove the bias: bytes left.  */
	movaps	%xmm5, -0x10(%rdi)
	addq	%rdx, %rdi
	movdqu	%xmm0, (%r8)		/* Store the saved head bytes.  */
	addq	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
910
/* Backward copy for a source end 6 bytes past a 16-byte boundary
   (assumed from the L(shl_table_bwd) dispatch, which is not in view).
   Expects %r9 = address of L(shl_6_bwd), %rcx = cache-size
   threshold, and the pending tail store in %xmm0/%r8.  Each pass
   moves 64 bytes downward using aligned loads, palignr $6 and
   aligned stores.  */
	.p2align 4
L(shl_6_bwd):
	leaq	(L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
	cmpq	%rcx, %rdx
	movaps	-0x06(%rsi), %xmm1
	jb	L(L6_bwd)
	/* At or above the threshold: loop through the prefetch.  */
	leaq	(L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9
L(L6_bwd):
	leaq	-0x40(%rdx), %rdx	/* Bias the count by one pass.  */
	jmp	*%r9
	ud2
L(shl_6_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_6_bwd_loop_L1):
	movaps	-0x16(%rsi), %xmm2
	subq	$0x40, %rdx		/* CF consumed by the jb below.  */
	movaps	-0x26(%rsi), %xmm3
	movaps	-0x36(%rsi), %xmm4
	movaps	-0x46(%rsi), %xmm5
	leaq	-0x40(%rsi), %rsi
	palignr	$6, %xmm2, %xmm1
	palignr	$6, %xmm3, %xmm2
	palignr	$6, %xmm4, %xmm3
	palignr	$6, %xmm5, %xmm4

	movaps	%xmm1, -0x10(%rdi)
	movaps	%xmm5, %xmm1		/* Lowest chunk feeds the next pass.  */

	movaps	%xmm2, -0x20(%rdi)
	leaq	-0x40(%rdi), %rdi

	movaps	%xmm3, 0x10(%rdi)
	jb	L(shl_6_bwd_end)
	movaps	%xmm4, (%rdi)
	jmp	*%r9
	ud2
L(shl_6_bwd_end):
	movaps	%xmm4, (%rdi)
	leaq	0x40(%rdx), %rdx	/* Remove the bias: bytes left.  */
	movdqu	%xmm0, (%r8)		/* Store the saved tail bytes.  */
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
952
/* Forward copy for a source 7 bytes past a 16-byte boundary (assumed
   from the L(shl_table) dispatch, which is not in view).  Expects
   %r9 = address of L(shl_7), %rcx = cache-size threshold, and the
   pending head store in %xmm0/%r8.  Each pass does four aligned
   loads, rebuilds 64 destination bytes with palignr $7 and writes
   them with aligned stores.  */
	.p2align 4
L(shl_7):
	leaq	(L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
	cmpq	%rcx, %rdx
	movaps	-0x07(%rsi), %xmm1
	jb	L(L7_fwd)
	/* At or above the threshold: loop through the prefetch.  */
	leaq	(L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9
L(L7_fwd):
	leaq	-0x40(%rdx), %rdx	/* Bias the count by one pass.  */
	jmp	*%r9
	ud2
L(shl_7_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_7_loop_L1):
	subq	$0x40, %rdx		/* CF consumed by the jb below.  */
	movaps	0x09(%rsi), %xmm2
	movaps	0x19(%rsi), %xmm3
	movaps	0x29(%rsi), %xmm4
	movaps	0x39(%rsi), %xmm5
	movdqa	%xmm5, %xmm6		/* Raw last chunk for the next pass.  */
	palignr	$7, %xmm4, %xmm5
	leaq	0x40(%rsi), %rsi
	palignr	$7, %xmm3, %xmm4
	palignr	$7, %xmm2, %xmm3
	leaq	0x40(%rdi), %rdi
	palignr	$7, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1
	movdqa	%xmm2, -0x40(%rdi)
	movaps	%xmm3, -0x30(%rdi)
	jb	L(shl_7_end)
	movaps	%xmm4, -0x20(%rdi)
	movaps	%xmm5, -0x10(%rdi)
	jmp	*%r9
	ud2
L(shl_7_end):
	movaps	%xmm4, -0x20(%rdi)
	leaq	0x40(%rdx), %rdx	/* Remove the bias: bytes left.  */
	movaps	%xmm5, -0x10(%rdi)
	addq	%rdx, %rdi
	movdqu	%xmm0, (%r8)		/* Store the saved head bytes.  */
	addq	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
995
/* Backward copy for a source end 7 bytes past a 16-byte boundary
   (assumed from the L(shl_table_bwd) dispatch, which is not in view).
   Expects %r9 = address of L(shl_7_bwd), %rcx = cache-size
   threshold, and the pending tail store in %xmm0/%r8.  Each pass
   moves 64 bytes downward using aligned loads, palignr $7 and
   aligned stores.  */
	.p2align 4
L(shl_7_bwd):
	leaq	(L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
	cmpq	%rcx, %rdx
	movaps	-0x07(%rsi), %xmm1
	jb	L(L7_bwd)
	/* At or above the threshold: loop through the prefetch.  */
	leaq	(L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9
L(L7_bwd):
	leaq	-0x40(%rdx), %rdx	/* Bias the count by one pass.  */
	jmp	*%r9
	ud2
L(shl_7_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_7_bwd_loop_L1):
	movaps	-0x17(%rsi), %xmm2
	subq	$0x40, %rdx		/* CF consumed by the jb below.  */
	movaps	-0x27(%rsi), %xmm3
	movaps	-0x37(%rsi), %xmm4
	movaps	-0x47(%rsi), %xmm5
	leaq	-0x40(%rsi), %rsi
	palignr	$7, %xmm2, %xmm1
	palignr	$7, %xmm3, %xmm2
	palignr	$7, %xmm4, %xmm3
	palignr	$7, %xmm5, %xmm4

	movaps	%xmm1, -0x10(%rdi)
	movaps	%xmm5, %xmm1		/* Lowest chunk feeds the next pass.  */

	movaps	%xmm2, -0x20(%rdi)
	leaq	-0x40(%rdi), %rdi

	movaps	%xmm3, 0x10(%rdi)
	jb	L(shl_7_bwd_end)
	movaps	%xmm4, (%rdi)
	jmp	*%r9
	ud2
L(shl_7_bwd_end):
	movaps	%xmm4, (%rdi)
	leaq	0x40(%rdx), %rdx	/* Remove the bias: bytes left.  */
	movdqu	%xmm0, (%r8)		/* Store the saved tail bytes.  */
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
1037
/* Forward copy for a source 8 bytes past a 16-byte boundary (assumed
   from the L(shl_table) dispatch, which is not in view).  Expects
   %r9 = address of L(shl_8), %rcx = cache-size threshold, and the
   pending head store in %xmm0/%r8.  Each pass does four aligned
   loads, rebuilds 64 destination bytes with palignr $8 and writes
   them with aligned stores.  */
	.p2align 4
L(shl_8):
	lea	(L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
	cmp	%rcx, %rdx
	movaps	-0x08(%rsi), %xmm1
	jb	L(L8_fwd)
	/* At or above the threshold: loop through the prefetch.  */
	lea	(L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9
L(L8_fwd):
	lea	-64(%rdx), %rdx		/* Bias the count by one pass.  */
	jmp	*%r9
	/* Every other indirect jump in this file is followed by ud2;
	   this one was the lone exception.  */
	ud2
L(shl_8_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_8_loop_L1):
	sub	$64, %rdx		/* CF consumed by the jb below.  */
	movaps	0x08(%rsi), %xmm2
	movaps	0x18(%rsi), %xmm3
	movaps	0x28(%rsi), %xmm4
	movaps	0x38(%rsi), %xmm5
	movdqa	%xmm5, %xmm6		/* Raw last chunk for the next pass.  */
	palignr	$8, %xmm4, %xmm5
	lea	64(%rsi), %rsi
	palignr	$8, %xmm3, %xmm4
	palignr	$8, %xmm2, %xmm3
	lea	64(%rdi), %rdi
	palignr	$8, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1
	movdqa	%xmm2, -0x40(%rdi)
	movaps	%xmm3, -0x30(%rdi)
	jb	L(shl_8_end)
	movaps	%xmm4, -0x20(%rdi)
	movaps	%xmm5, -0x10(%rdi)
	jmp	*%r9
	ud2
	.p2align 4
L(shl_8_end):
	lea	64(%rdx), %rdx		/* Remove the bias: bytes left.  */
	movaps	%xmm4, -0x20(%rdi)
	add	%rdx, %rsi
	movaps	%xmm5, -0x10(%rdi)
	add	%rdx, %rdi
	movdqu	%xmm0, (%r8)		/* Store the saved head bytes.  */
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1080
/* Backward copy for a source end 8 bytes past a 16-byte boundary
   (assumed from the L(shl_table_bwd) dispatch, which is not in view).
   Expects %r9 = address of L(shl_8_bwd), %rcx = cache-size
   threshold, and the pending tail store in %xmm0/%r8.  Each pass
   moves 64 bytes downward using aligned loads, palignr $8 and
   aligned stores.  */
	.p2align 4
L(shl_8_bwd):
	leaq	(L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
	cmpq	%rcx, %rdx
	movaps	-0x08(%rsi), %xmm1
	jb	L(L8_bwd)
	/* At or above the threshold: loop through the prefetch.  */
	leaq	(L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9
L(L8_bwd):
	leaq	-0x40(%rdx), %rdx	/* Bias the count by one pass.  */
	jmp	*%r9
	ud2
L(shl_8_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_8_bwd_loop_L1):
	movaps	-0x18(%rsi), %xmm2
	subq	$0x40, %rdx		/* CF consumed by the jb below.  */
	movaps	-0x28(%rsi), %xmm3
	movaps	-0x38(%rsi), %xmm4
	movaps	-0x48(%rsi), %xmm5
	leaq	-0x40(%rsi), %rsi
	palignr	$8, %xmm2, %xmm1
	palignr	$8, %xmm3, %xmm2
	palignr	$8, %xmm4, %xmm3
	palignr	$8, %xmm5, %xmm4

	movaps	%xmm1, -0x10(%rdi)
	movaps	%xmm5, %xmm1		/* Lowest chunk feeds the next pass.  */

	movaps	%xmm2, -0x20(%rdi)
	leaq	-0x40(%rdi), %rdi

	movaps	%xmm3, 0x10(%rdi)
	jb	L(shl_8_bwd_end)
	movaps	%xmm4, (%rdi)
	jmp	*%r9
	ud2
L(shl_8_bwd_end):
	movaps	%xmm4, (%rdi)
	leaq	0x40(%rdx), %rdx	/* Remove the bias: bytes left.  */
	movdqu	%xmm0, (%r8)		/* Store the saved tail bytes.  */
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
1122
/* Forward copy for a source 9 bytes past a 16-byte boundary (assumed
   from the L(shl_table) dispatch, which is not in view).  Expects
   %r9 = address of L(shl_9), %rcx = cache-size threshold, and the
   pending head store in %xmm0/%r8.  Each pass does four aligned
   loads, rebuilds 64 destination bytes with palignr $9 and writes
   them with aligned stores.  */
	.p2align 4
L(shl_9):
	leaq	(L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
	cmpq	%rcx, %rdx
	movaps	-0x09(%rsi), %xmm1
	jb	L(L9_fwd)
	/* At or above the threshold: loop through the prefetch.  */
	leaq	(L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9
L(L9_fwd):
	leaq	-0x40(%rdx), %rdx	/* Bias the count by one pass.  */
	jmp	*%r9
	ud2
L(shl_9_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_9_loop_L1):
	subq	$0x40, %rdx		/* CF consumed by the jb below.  */
	movaps	0x07(%rsi), %xmm2
	movaps	0x17(%rsi), %xmm3
	movaps	0x27(%rsi), %xmm4
	movaps	0x37(%rsi), %xmm5
	movdqa	%xmm5, %xmm6		/* Raw last chunk for the next pass.  */
	palignr	$9, %xmm4, %xmm5
	leaq	0x40(%rsi), %rsi
	palignr	$9, %xmm3, %xmm4
	palignr	$9, %xmm2, %xmm3
	leaq	0x40(%rdi), %rdi
	palignr	$9, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1
	movdqa	%xmm2, -0x40(%rdi)
	movaps	%xmm3, -0x30(%rdi)
	jb	L(shl_9_end)
	movaps	%xmm4, -0x20(%rdi)
	movaps	%xmm5, -0x10(%rdi)
	jmp	*%r9
	ud2
L(shl_9_end):
	movaps	%xmm4, -0x20(%rdi)
	leaq	0x40(%rdx), %rdx	/* Remove the bias: bytes left.  */
	movaps	%xmm5, -0x10(%rdi)
	addq	%rdx, %rdi
	movdqu	%xmm0, (%r8)		/* Store the saved head bytes.  */
	addq	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
1165
/* Backward copy for a source end 9 bytes past a 16-byte boundary
   (assumed from the L(shl_table_bwd) dispatch, which is not in view).
   Expects %r9 = address of L(shl_9_bwd), %rcx = cache-size
   threshold, and the pending tail store in %xmm0/%r8.  Each pass
   moves 64 bytes downward using aligned loads, palignr $9 and
   aligned stores.  */
	.p2align 4
L(shl_9_bwd):
	leaq	(L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
	cmpq	%rcx, %rdx
	movaps	-0x09(%rsi), %xmm1
	jb	L(L9_bwd)
	/* At or above the threshold: loop through the prefetch.  */
	leaq	(L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
L(L9_bwd):
	leaq	-0x40(%rdx), %rdx	/* Bias the count by one pass.  */
	jmp	*%r9
	ud2
L(shl_9_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_9_bwd_loop_L1):
	movaps	-0x19(%rsi), %xmm2
	subq	$0x40, %rdx		/* CF consumed by the jb below.  */
	movaps	-0x29(%rsi), %xmm3
	movaps	-0x39(%rsi), %xmm4
	movaps	-0x49(%rsi), %xmm5
	leaq	-0x40(%rsi), %rsi
	palignr	$9, %xmm2, %xmm1
	palignr	$9, %xmm3, %xmm2
	palignr	$9, %xmm4, %xmm3
	palignr	$9, %xmm5, %xmm4

	movaps	%xmm1, -0x10(%rdi)
	movaps	%xmm5, %xmm1		/* Lowest chunk feeds the next pass.  */

	movaps	%xmm2, -0x20(%rdi)
	leaq	-0x40(%rdi), %rdi

	movaps	%xmm3, 0x10(%rdi)
	jb	L(shl_9_bwd_end)
	movaps	%xmm4, (%rdi)
	jmp	*%r9
	ud2
L(shl_9_bwd_end):
	movaps	%xmm4, (%rdi)
	leaq	0x40(%rdx), %rdx	/* Remove the bias: bytes left.  */
	movdqu	%xmm0, (%r8)		/* Store the saved tail bytes.  */
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
1207
/* Forward copy for a source 10 bytes past a 16-byte boundary (assumed
   from the L(shl_table) dispatch, which is not in view).  Expects
   %r9 = address of L(shl_10), %rcx = cache-size threshold, and the
   pending head store in %xmm0/%r8.  Each pass does four aligned
   loads, rebuilds 64 destination bytes with palignr $10 and writes
   them with aligned stores.  */
	.p2align 4
L(shl_10):
	leaq	(L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
	cmpq	%rcx, %rdx
	movaps	-0x0a(%rsi), %xmm1
	jb	L(L10_fwd)
	/* At or above the threshold: loop through the prefetch.  */
	leaq	(L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
L(L10_fwd):
	leaq	-0x40(%rdx), %rdx	/* Bias the count by one pass.  */
	jmp	*%r9
	ud2
L(shl_10_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_10_loop_L1):
	subq	$0x40, %rdx		/* CF consumed by the jb below.  */
	movaps	0x06(%rsi), %xmm2
	movaps	0x16(%rsi), %xmm3
	movaps	0x26(%rsi), %xmm4
	movaps	0x36(%rsi), %xmm5
	movdqa	%xmm5, %xmm6		/* Raw last chunk for the next pass.  */
	palignr	$10, %xmm4, %xmm5
	leaq	0x40(%rsi), %rsi
	palignr	$10, %xmm3, %xmm4
	palignr	$10, %xmm2, %xmm3
	leaq	0x40(%rdi), %rdi
	palignr	$10, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1
	movdqa	%xmm2, -0x40(%rdi)
	movaps	%xmm3, -0x30(%rdi)
	jb	L(shl_10_end)
	movaps	%xmm4, -0x20(%rdi)
	movaps	%xmm5, -0x10(%rdi)
	jmp	*%r9
	ud2
L(shl_10_end):
	movaps	%xmm4, -0x20(%rdi)
	leaq	0x40(%rdx), %rdx	/* Remove the bias: bytes left.  */
	movaps	%xmm5, -0x10(%rdi)
	addq	%rdx, %rdi
	movdqu	%xmm0, (%r8)		/* Store the saved head bytes.  */
	addq	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
1250
/* Backward copy for a source end 10 bytes past a 16-byte boundary
   (assumed from the L(shl_table_bwd) dispatch, which is not in view).
   Expects %r9 = address of L(shl_10_bwd), %rcx = cache-size
   threshold, and the pending tail store in %xmm0/%r8.  Each pass
   moves 64 bytes downward using aligned loads, palignr $10 and
   aligned stores.  */
	.p2align 4
L(shl_10_bwd):
	leaq	(L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
	cmpq	%rcx, %rdx
	movaps	-0x0a(%rsi), %xmm1
	jb	L(L10_bwd)
	/* At or above the threshold: loop through the prefetch.  */
	leaq	(L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
L(L10_bwd):
	leaq	-0x40(%rdx), %rdx	/* Bias the count by one pass.  */
	jmp	*%r9
	ud2
L(shl_10_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_10_bwd_loop_L1):
	movaps	-0x1a(%rsi), %xmm2
	subq	$0x40, %rdx		/* CF consumed by the jb below.  */
	movaps	-0x2a(%rsi), %xmm3
	movaps	-0x3a(%rsi), %xmm4
	movaps	-0x4a(%rsi), %xmm5
	leaq	-0x40(%rsi), %rsi
	palignr	$10, %xmm2, %xmm1
	palignr	$10, %xmm3, %xmm2
	palignr	$10, %xmm4, %xmm3
	palignr	$10, %xmm5, %xmm4

	movaps	%xmm1, -0x10(%rdi)
	movaps	%xmm5, %xmm1		/* Lowest chunk feeds the next pass.  */

	movaps	%xmm2, -0x20(%rdi)
	leaq	-0x40(%rdi), %rdi

	movaps	%xmm3, 0x10(%rdi)
	jb	L(shl_10_bwd_end)
	movaps	%xmm4, (%rdi)
	jmp	*%r9
	ud2
L(shl_10_bwd_end):
	movaps	%xmm4, (%rdi)
	leaq	0x40(%rdx), %rdx	/* Remove the bias: bytes left.  */
	movdqu	%xmm0, (%r8)		/* Store the saved tail bytes.  */
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
1292
/* Forward copy for a source 11 bytes past a 16-byte boundary (assumed
   from the L(shl_table) dispatch, which is not in view).  Expects
   %r9 = address of L(shl_11), %rcx = cache-size threshold, and the
   pending head store in %xmm0/%r8.  Each pass does four aligned
   loads, rebuilds 64 destination bytes with palignr $11 and writes
   them with aligned stores.  */
	.p2align 4
L(shl_11):
	leaq	(L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
	cmpq	%rcx, %rdx
	movaps	-0x0b(%rsi), %xmm1
	jb	L(L11_fwd)
	/* At or above the threshold: loop through the prefetch.  */
	leaq	(L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
L(L11_fwd):
	leaq	-0x40(%rdx), %rdx	/* Bias the count by one pass.  */
	jmp	*%r9
	ud2
L(shl_11_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_11_loop_L1):
	subq	$0x40, %rdx		/* CF consumed by the jb below.  */
	movaps	0x05(%rsi), %xmm2
	movaps	0x15(%rsi), %xmm3
	movaps	0x25(%rsi), %xmm4
	movaps	0x35(%rsi), %xmm5
	movdqa	%xmm5, %xmm6		/* Raw last chunk for the next pass.  */
	palignr	$11, %xmm4, %xmm5
	leaq	0x40(%rsi), %rsi
	palignr	$11, %xmm3, %xmm4
	palignr	$11, %xmm2, %xmm3
	leaq	0x40(%rdi), %rdi
	palignr	$11, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1
	movdqa	%xmm2, -0x40(%rdi)
	movaps	%xmm3, -0x30(%rdi)
	jb	L(shl_11_end)
	movaps	%xmm4, -0x20(%rdi)
	movaps	%xmm5, -0x10(%rdi)
	jmp	*%r9
	ud2
L(shl_11_end):
	movaps	%xmm4, -0x20(%rdi)
	leaq	0x40(%rdx), %rdx	/* Remove the bias: bytes left.  */
	movaps	%xmm5, -0x10(%rdi)
	addq	%rdx, %rdi
	movdqu	%xmm0, (%r8)		/* Store the saved head bytes.  */
	addq	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
1335
1336 .p2align 4
/* Backward copy, 64 bytes per iteration, for a source lying 11 bytes
   past a 16-byte boundary: every aligned load uses %rsi - 16*k - 11 and
   PALIGNR $11 splices neighbouring chunks into the values stored below
   %rdi.  %r9 presumably holds the address of L(shl_11_bwd) on entry
   (TODO confirm against the dispatcher); the LEAs retarget it to the
   loop head, taking the prefetching _L2 entry unless %rdx < %rcx.  */
L(shl_11_bwd):
	movaps	-0x0b(%rsi), %xmm1	/* Newest chunk, carried in %xmm1.  */
	lea	(L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
	cmp	%rcx, %rdx
	jb	L(L11_bwd)
	lea	(L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
L(L11_bwd):
	lea	-64(%rdx), %rdx
	jmp	*%r9
	ud2
L(shl_11_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_11_bwd_loop_L1):
	sub	$0x40, %rdx		/* CF is consumed by the JB below.  */
	movaps	-0x1b(%rsi), %xmm2
	movaps	-0x2b(%rsi), %xmm3
	movaps	-0x3b(%rsi), %xmm4
	movaps	-0x4b(%rsi), %xmm5
	lea	-0x40(%rsi), %rsi
	/* Order matters: each PALIGNR reads a register the next one
	   overwrites.  */
	palignr	$11, %xmm2, %xmm1
	palignr	$11, %xmm3, %xmm2
	palignr	$11, %xmm4, %xmm3
	palignr	$11, %xmm5, %xmm4
	movaps	%xmm1, -0x10(%rdi)
	movaps	%xmm2, -0x20(%rdi)
	movaps	%xmm5, %xmm1		/* Raw chunk for the next iteration.  */
	lea	-0x40(%rdi), %rdi
	movaps	%xmm3, 0x10(%rdi)
	jb	L(shl_11_bwd_end)
	movaps	%xmm4, (%rdi)
	jmp	*%r9
	ud2
L(shl_11_bwd_end):
	movaps	%xmm4, (%rdi)
	/* %xmm0 and %r8 were set up before this fragment; they are only
	   stored here.  */
	movdqu	%xmm0, (%r8)
	lea	64(%rdx), %rdx		/* Undo the final SUB.  */
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1377
1378 .p2align 4
/* Forward copy, 64 bytes per iteration, for a source lying 12 bytes
   past a 16-byte boundary: every aligned load uses %rsi + 16*k - 12 and
   PALIGNR $12 splices neighbouring chunks into the values stored at
   %rdi.  %r9 presumably holds the address of L(shl_12) on entry (TODO
   confirm against the dispatcher); the LEAs retarget it to the loop
   head, taking the prefetching _L2 entry unless %rdx < %rcx.  */
L(shl_12):
	movaps	-0x0c(%rsi), %xmm1	/* Oldest chunk, carried in %xmm1.  */
	lea	(L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
	cmp	%rcx, %rdx
	jb	L(L12_fwd)
	lea	(L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
L(L12_fwd):
	lea	-64(%rdx), %rdx
	jmp	*%r9
	ud2
L(shl_12_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_12_loop_L1):
	sub	$64, %rdx		/* CF is consumed by the JB below.  */
	movaps	0x04(%rsi), %xmm2
	movaps	0x14(%rsi), %xmm3
	movaps	0x24(%rsi), %xmm4
	movaps	0x34(%rsi), %xmm5
	lea	64(%rsi), %rsi
	lea	64(%rdi), %rdi
	movdqa	%xmm5, %xmm6		/* Keep the raw last chunk.  */
	/* Order matters: each PALIGNR reads a register the next one
	   overwrites.  */
	palignr	$12, %xmm4, %xmm5
	palignr	$12, %xmm3, %xmm4
	palignr	$12, %xmm2, %xmm3
	palignr	$12, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1		/* Raw chunk for the next iteration.  */
	movdqa	%xmm2, -0x40(%rdi)
	movaps	%xmm3, -0x30(%rdi)
	jb	L(shl_12_end)
	movaps	%xmm4, -0x20(%rdi)
	movaps	%xmm5, -0x10(%rdi)
	jmp	*%r9
	ud2
L(shl_12_end):
	movaps	%xmm4, -0x20(%rdi)
	movaps	%xmm5, -0x10(%rdi)
	/* %xmm0 and %r8 were set up before this fragment; they are only
	   stored here.  */
	movdqu	%xmm0, (%r8)
	lea	64(%rdx), %rdx		/* Undo the final SUB.  */
	add	%rdx, %rdi
	add	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1420
1421 .p2align 4
/* Backward copy, 64 bytes per iteration, for a source lying 12 bytes
   past a 16-byte boundary: every aligned load uses %rsi - 16*k - 12 and
   PALIGNR $12 splices neighbouring chunks into the values stored below
   %rdi.  %r9 presumably holds the address of L(shl_12_bwd) on entry
   (TODO confirm against the dispatcher); the LEAs retarget it to the
   loop head, taking the prefetching _L2 entry unless %rdx < %rcx.  */
L(shl_12_bwd):
	movaps	-0x0c(%rsi), %xmm1	/* Newest chunk, carried in %xmm1.  */
	lea	(L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
	cmp	%rcx, %rdx
	jb	L(L12_bwd)
	lea	(L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
L(L12_bwd):
	lea	-64(%rdx), %rdx
	jmp	*%r9
	ud2
L(shl_12_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_12_bwd_loop_L1):
	sub	$0x40, %rdx		/* CF is consumed by the JB below.  */
	movaps	-0x1c(%rsi), %xmm2
	movaps	-0x2c(%rsi), %xmm3
	movaps	-0x3c(%rsi), %xmm4
	movaps	-0x4c(%rsi), %xmm5
	lea	-0x40(%rsi), %rsi
	/* Order matters: each PALIGNR reads a register the next one
	   overwrites.  */
	palignr	$12, %xmm2, %xmm1
	palignr	$12, %xmm3, %xmm2
	palignr	$12, %xmm4, %xmm3
	palignr	$12, %xmm5, %xmm4
	movaps	%xmm1, -0x10(%rdi)
	movaps	%xmm2, -0x20(%rdi)
	movaps	%xmm5, %xmm1		/* Raw chunk for the next iteration.  */
	lea	-0x40(%rdi), %rdi
	movaps	%xmm3, 0x10(%rdi)
	jb	L(shl_12_bwd_end)
	movaps	%xmm4, (%rdi)
	jmp	*%r9
	ud2
L(shl_12_bwd_end):
	movaps	%xmm4, (%rdi)
	/* %xmm0 and %r8 were set up before this fragment; they are only
	   stored here.  */
	movdqu	%xmm0, (%r8)
	lea	64(%rdx), %rdx		/* Undo the final SUB.  */
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1462
1463 .p2align 4
/* Forward copy, 64 bytes per iteration, for a source lying 13 bytes
   past a 16-byte boundary: every aligned load uses %rsi + 16*k - 13 and
   PALIGNR $13 splices neighbouring chunks into the values stored at
   %rdi.  %r9 presumably holds the address of L(shl_13) on entry (TODO
   confirm against the dispatcher); the LEAs retarget it to the loop
   head, taking the prefetching _L2 entry unless %rdx < %rcx.  */
L(shl_13):
	movaps	-0x0d(%rsi), %xmm1	/* Oldest chunk, carried in %xmm1.  */
	lea	(L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
	cmp	%rcx, %rdx
	jb	L(L13_fwd)
	lea	(L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
L(L13_fwd):
	lea	-64(%rdx), %rdx
	jmp	*%r9
	ud2
L(shl_13_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_13_loop_L1):
	sub	$64, %rdx		/* CF is consumed by the JB below.  */
	movaps	0x03(%rsi), %xmm2
	movaps	0x13(%rsi), %xmm3
	movaps	0x23(%rsi), %xmm4
	movaps	0x33(%rsi), %xmm5
	lea	64(%rsi), %rsi
	lea	64(%rdi), %rdi
	movdqa	%xmm5, %xmm6		/* Keep the raw last chunk.  */
	/* Order matters: each PALIGNR reads a register the next one
	   overwrites.  */
	palignr	$13, %xmm4, %xmm5
	palignr	$13, %xmm3, %xmm4
	palignr	$13, %xmm2, %xmm3
	palignr	$13, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1		/* Raw chunk for the next iteration.  */
	movdqa	%xmm2, -0x40(%rdi)
	movaps	%xmm3, -0x30(%rdi)
	jb	L(shl_13_end)
	movaps	%xmm4, -0x20(%rdi)
	movaps	%xmm5, -0x10(%rdi)
	jmp	*%r9
	ud2
L(shl_13_end):
	movaps	%xmm4, -0x20(%rdi)
	movaps	%xmm5, -0x10(%rdi)
	/* %xmm0 and %r8 were set up before this fragment; they are only
	   stored here.  */
	movdqu	%xmm0, (%r8)
	lea	64(%rdx), %rdx		/* Undo the final SUB.  */
	add	%rdx, %rdi
	add	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1505
1506 .p2align 4
/* Backward copy, 64 bytes per iteration, for a source lying 13 bytes
   past a 16-byte boundary: every aligned load uses %rsi - 16*k - 13 and
   PALIGNR $13 splices neighbouring chunks into the values stored below
   %rdi.  %r9 presumably holds the address of L(shl_13_bwd) on entry
   (TODO confirm against the dispatcher); the LEAs retarget it to the
   loop head, taking the prefetching _L2 entry unless %rdx < %rcx.  */
L(shl_13_bwd):
	movaps	-0x0d(%rsi), %xmm1	/* Newest chunk, carried in %xmm1.  */
	lea	(L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
	cmp	%rcx, %rdx
	jb	L(L13_bwd)
	lea	(L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
L(L13_bwd):
	lea	-64(%rdx), %rdx
	jmp	*%r9
	ud2
L(shl_13_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_13_bwd_loop_L1):
	sub	$0x40, %rdx		/* CF is consumed by the JB below.  */
	movaps	-0x1d(%rsi), %xmm2
	movaps	-0x2d(%rsi), %xmm3
	movaps	-0x3d(%rsi), %xmm4
	movaps	-0x4d(%rsi), %xmm5
	lea	-0x40(%rsi), %rsi
	/* Order matters: each PALIGNR reads a register the next one
	   overwrites.  */
	palignr	$13, %xmm2, %xmm1
	palignr	$13, %xmm3, %xmm2
	palignr	$13, %xmm4, %xmm3
	palignr	$13, %xmm5, %xmm4
	movaps	%xmm1, -0x10(%rdi)
	movaps	%xmm2, -0x20(%rdi)
	movaps	%xmm5, %xmm1		/* Raw chunk for the next iteration.  */
	lea	-0x40(%rdi), %rdi
	movaps	%xmm3, 0x10(%rdi)
	jb	L(shl_13_bwd_end)
	movaps	%xmm4, (%rdi)
	jmp	*%r9
	ud2
L(shl_13_bwd_end):
	movaps	%xmm4, (%rdi)
	/* %xmm0 and %r8 were set up before this fragment; they are only
	   stored here.  */
	movdqu	%xmm0, (%r8)
	lea	64(%rdx), %rdx		/* Undo the final SUB.  */
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1547
1548 .p2align 4
/* Forward copy, 64 bytes per iteration, for a source lying 14 bytes
   past a 16-byte boundary: every aligned load uses %rsi + 16*k - 14 and
   PALIGNR $14 splices neighbouring chunks into the values stored at
   %rdi.  %r9 presumably holds the address of L(shl_14) on entry (TODO
   confirm against the dispatcher); the LEAs retarget it to the loop
   head, taking the prefetching _L2 entry unless %rdx < %rcx.  */
L(shl_14):
	movaps	-0x0e(%rsi), %xmm1	/* Oldest chunk, carried in %xmm1.  */
	lea	(L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
	cmp	%rcx, %rdx
	jb	L(L14_fwd)
	lea	(L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
L(L14_fwd):
	lea	-64(%rdx), %rdx
	jmp	*%r9
	ud2
L(shl_14_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_14_loop_L1):
	sub	$64, %rdx		/* CF is consumed by the JB below.  */
	movaps	0x02(%rsi), %xmm2
	movaps	0x12(%rsi), %xmm3
	movaps	0x22(%rsi), %xmm4
	movaps	0x32(%rsi), %xmm5
	lea	64(%rsi), %rsi
	lea	64(%rdi), %rdi
	movdqa	%xmm5, %xmm6		/* Keep the raw last chunk.  */
	/* Order matters: each PALIGNR reads a register the next one
	   overwrites.  */
	palignr	$14, %xmm4, %xmm5
	palignr	$14, %xmm3, %xmm4
	palignr	$14, %xmm2, %xmm3
	palignr	$14, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1		/* Raw chunk for the next iteration.  */
	movdqa	%xmm2, -0x40(%rdi)
	movaps	%xmm3, -0x30(%rdi)
	jb	L(shl_14_end)
	movaps	%xmm4, -0x20(%rdi)
	movaps	%xmm5, -0x10(%rdi)
	jmp	*%r9
	ud2
L(shl_14_end):
	movaps	%xmm4, -0x20(%rdi)
	movaps	%xmm5, -0x10(%rdi)
	/* %xmm0 and %r8 were set up before this fragment; they are only
	   stored here.  */
	movdqu	%xmm0, (%r8)
	lea	64(%rdx), %rdx		/* Undo the final SUB.  */
	add	%rdx, %rdi
	add	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1590
1591 .p2align 4
/* Backward copy, 64 bytes per iteration, for a source lying 14 bytes
   past a 16-byte boundary: every aligned load uses %rsi - 16*k - 14 and
   PALIGNR $14 splices neighbouring chunks into the values stored below
   %rdi.  %r9 presumably holds the address of L(shl_14_bwd) on entry
   (TODO confirm against the dispatcher); the LEAs retarget it to the
   loop head, taking the prefetching _L2 entry unless %rdx < %rcx.  */
L(shl_14_bwd):
	movaps	-0x0e(%rsi), %xmm1	/* Newest chunk, carried in %xmm1.  */
	lea	(L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
	cmp	%rcx, %rdx
	jb	L(L14_bwd)
	lea	(L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
L(L14_bwd):
	lea	-64(%rdx), %rdx
	jmp	*%r9
	ud2
L(shl_14_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_14_bwd_loop_L1):
	sub	$0x40, %rdx		/* CF is consumed by the JB below.  */
	movaps	-0x1e(%rsi), %xmm2
	movaps	-0x2e(%rsi), %xmm3
	movaps	-0x3e(%rsi), %xmm4
	movaps	-0x4e(%rsi), %xmm5
	lea	-0x40(%rsi), %rsi
	/* Order matters: each PALIGNR reads a register the next one
	   overwrites.  */
	palignr	$14, %xmm2, %xmm1
	palignr	$14, %xmm3, %xmm2
	palignr	$14, %xmm4, %xmm3
	palignr	$14, %xmm5, %xmm4
	movaps	%xmm1, -0x10(%rdi)
	movaps	%xmm2, -0x20(%rdi)
	movaps	%xmm5, %xmm1		/* Raw chunk for the next iteration.  */
	lea	-0x40(%rdi), %rdi
	movaps	%xmm3, 0x10(%rdi)
	jb	L(shl_14_bwd_end)
	movaps	%xmm4, (%rdi)
	jmp	*%r9
	ud2
L(shl_14_bwd_end):
	movaps	%xmm4, (%rdi)
	/* %xmm0 and %r8 were set up before this fragment; they are only
	   stored here.  */
	movdqu	%xmm0, (%r8)
	lea	64(%rdx), %rdx		/* Undo the final SUB.  */
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1632
1633 .p2align 4
/* Forward copy, 64 bytes per iteration, for a source lying 15 bytes
   past a 16-byte boundary: every aligned load uses %rsi + 16*k - 15 and
   PALIGNR $15 splices neighbouring chunks into the values stored at
   %rdi.  %r9 presumably holds the address of L(shl_15) on entry (TODO
   confirm against the dispatcher); the LEAs retarget it to the loop
   head, taking the prefetching _L2 entry unless %rdx < %rcx.  */
L(shl_15):
	movaps	-0x0f(%rsi), %xmm1	/* Oldest chunk, carried in %xmm1.  */
	lea	(L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
	cmp	%rcx, %rdx
	jb	L(L15_fwd)
	lea	(L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
L(L15_fwd):
	lea	-64(%rdx), %rdx
	jmp	*%r9
	ud2
L(shl_15_loop_L2):
	prefetchnta 0x1c0(%rsi)
L(shl_15_loop_L1):
	sub	$64, %rdx		/* CF is consumed by the JB below.  */
	movaps	0x01(%rsi), %xmm2
	movaps	0x11(%rsi), %xmm3
	movaps	0x21(%rsi), %xmm4
	movaps	0x31(%rsi), %xmm5
	lea	64(%rsi), %rsi
	lea	64(%rdi), %rdi
	movdqa	%xmm5, %xmm6		/* Keep the raw last chunk.  */
	/* Order matters: each PALIGNR reads a register the next one
	   overwrites.  */
	palignr	$15, %xmm4, %xmm5
	palignr	$15, %xmm3, %xmm4
	palignr	$15, %xmm2, %xmm3
	palignr	$15, %xmm1, %xmm2
	movdqa	%xmm6, %xmm1		/* Raw chunk for the next iteration.  */
	movdqa	%xmm2, -0x40(%rdi)
	movaps	%xmm3, -0x30(%rdi)
	jb	L(shl_15_end)
	movaps	%xmm4, -0x20(%rdi)
	movaps	%xmm5, -0x10(%rdi)
	jmp	*%r9
	ud2
L(shl_15_end):
	movaps	%xmm4, -0x20(%rdi)
	movaps	%xmm5, -0x10(%rdi)
	/* %xmm0 and %r8 were set up before this fragment; they are only
	   stored here.  */
	movdqu	%xmm0, (%r8)
	lea	64(%rdx), %rdx		/* Undo the final SUB.  */
	add	%rdx, %rdi
	add	%rdx, %rsi
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1675
1676 .p2align 4
/* Backward copy, 64 bytes per iteration, for a source lying 15 bytes
   past a 16-byte boundary: every aligned load uses %rsi - 16*k - 15 and
   PALIGNR $15 splices neighbouring chunks into the values stored below
   %rdi.  %r9 presumably holds the address of L(shl_15_bwd) on entry
   (TODO confirm against the dispatcher); the LEAs retarget it to the
   loop head, taking the prefetching _L2 entry unless %rdx < %rcx.  */
L(shl_15_bwd):
	movaps	-0x0f(%rsi), %xmm1	/* Newest chunk, carried in %xmm1.  */
	lea	(L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
	cmp	%rcx, %rdx
	jb	L(L15_bwd)
	lea	(L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
L(L15_bwd):
	lea	-64(%rdx), %rdx
	jmp	*%r9
	ud2
L(shl_15_bwd_loop_L2):
	prefetchnta -0x1c0(%rsi)
L(shl_15_bwd_loop_L1):
	sub	$0x40, %rdx		/* CF is consumed by the JB below.  */
	movaps	-0x1f(%rsi), %xmm2
	movaps	-0x2f(%rsi), %xmm3
	movaps	-0x3f(%rsi), %xmm4
	movaps	-0x4f(%rsi), %xmm5
	lea	-0x40(%rsi), %rsi
	/* Order matters: each PALIGNR reads a register the next one
	   overwrites.  */
	palignr	$15, %xmm2, %xmm1
	palignr	$15, %xmm3, %xmm2
	palignr	$15, %xmm4, %xmm3
	palignr	$15, %xmm5, %xmm4
	movaps	%xmm1, -0x10(%rdi)
	movaps	%xmm2, -0x20(%rdi)
	movaps	%xmm5, %xmm1		/* Raw chunk for the next iteration.  */
	lea	-0x40(%rdi), %rdi
	movaps	%xmm3, 0x10(%rdi)
	jb	L(shl_15_bwd_end)
	movaps	%xmm4, (%rdi)
	jmp	*%r9
	ud2
L(shl_15_bwd_end):
	movaps	%xmm4, (%rdi)
	/* %xmm0 and %r8 were set up before this fragment; they are only
	   stored here.  */
	movdqu	%xmm0, (%r8)
	lea	64(%rdx), %rdx		/* Undo the final SUB.  */
	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
1717
1718 .p2align 4
/* Copy the 72 bytes ending at %rsi to the 72 bytes ending at %rdi.
   Every load precedes every store.  */
L(write_72bytes):
	mov	-8(%rsi), %rcx
	mov	-16(%rsi), %r11
	mov	-24(%rsi), %r10
	mov	-32(%rsi), %r9
	mov	-40(%rsi), %r8
	movdqu	-56(%rsi), %xmm1
	movdqu	-72(%rsi), %xmm0
	mov	%rcx, -8(%rdi)
	mov	%r11, -16(%rdi)
	mov	%r10, -24(%rdi)
	mov	%r9, -32(%rdi)
	mov	%r8, -40(%rdi)
	movdqu	%xmm1, -56(%rdi)
	movdqu	%xmm0, -72(%rdi)
	ret
1735
1736 .p2align 4
/* Copy the 64 bytes ending at %rsi to the 64 bytes ending at %rdi.
   Every load precedes every store.  */
L(write_64bytes):
	mov	-8(%rsi), %rdx
	mov	-16(%rsi), %r11
	mov	-24(%rsi), %r10
	mov	-32(%rsi), %r9
	mov	-40(%rsi), %r8
	mov	-48(%rsi), %rcx
	movdqu	-64(%rsi), %xmm0
	mov	%rdx, -8(%rdi)
	mov	%r11, -16(%rdi)
	mov	%r10, -24(%rdi)
	mov	%r9, -32(%rdi)
	mov	%r8, -40(%rdi)
	mov	%rcx, -48(%rdi)
	movdqu	%xmm0, -64(%rdi)
	ret
1753
1754 .p2align 4
/* Copy the 56 bytes ending at %rsi to the 56 bytes ending at %rdi.
   Every load precedes every store.  */
L(write_56bytes):
	mov	-8(%rsi), %rcx
	mov	-16(%rsi), %r11
	mov	-24(%rsi), %r10
	mov	-32(%rsi), %r9
	mov	-40(%rsi), %r8
	movdqu	-56(%rsi), %xmm0
	mov	%rcx, -8(%rdi)
	mov	%r11, -16(%rdi)
	mov	%r10, -24(%rdi)
	mov	%r9, -32(%rdi)
	mov	%r8, -40(%rdi)
	movdqu	%xmm0, -56(%rdi)
	ret
1769
1770 .p2align 4
/* Copy the 48 bytes ending at %rsi to the 48 bytes ending at %rdi.
   Every load precedes every store.  */
L(write_48bytes):
	mov	-8(%rsi), %rdx
	mov	-16(%rsi), %r11
	mov	-24(%rsi), %r10
	mov	-32(%rsi), %r9
	mov	-40(%rsi), %r8
	mov	-48(%rsi), %rcx
	mov	%rdx, -8(%rdi)
	mov	%r11, -16(%rdi)
	mov	%r10, -24(%rdi)
	mov	%r9, -32(%rdi)
	mov	%r8, -40(%rdi)
	mov	%rcx, -48(%rdi)
	ret
1785
1786 .p2align 4
/* Copy the 40 bytes ending at %rsi to the 40 bytes ending at %rdi.
   Every load precedes every store.  */
L(write_40bytes):
	mov	-8(%rsi), %rdx
	mov	-16(%rsi), %r11
	mov	-24(%rsi), %r10
	mov	-32(%rsi), %r9
	mov	-40(%rsi), %r8
	mov	%rdx, -8(%rdi)
	mov	%r11, -16(%rdi)
	mov	%r10, -24(%rdi)
	mov	%r9, -32(%rdi)
	mov	%r8, -40(%rdi)
	ret
1799
1800 .p2align 4
/* Copy the 32 bytes ending at %rsi to the 32 bytes ending at %rdi.
   Every load precedes every store.  */
L(write_32bytes):
	mov	-8(%rsi), %rdx
	mov	-16(%rsi), %r11
	mov	-24(%rsi), %r10
	mov	-32(%rsi), %r9
	mov	%rdx, -8(%rdi)
	mov	%r11, -16(%rdi)
	mov	%r10, -24(%rdi)
	mov	%r9, -32(%rdi)
	ret
1811
1812 .p2align 4
/* Copy the 24 bytes ending at %rsi to the 24 bytes ending at %rdi.
   Every load precedes every store.  */
L(write_24bytes):
	mov	-8(%rsi), %rdx
	mov	-16(%rsi), %r11
	mov	-24(%rsi), %r10
	mov	%rdx, -8(%rdi)
	mov	%r11, -16(%rdi)
	mov	%r10, -24(%rdi)
	ret
1821
1822 .p2align 4
/* Copy the 16 bytes ending at %rsi to the 16 bytes ending at %rdi.
   Every load precedes every store.  */
L(write_16bytes):
	mov	-8(%rsi), %rdx
	mov	-16(%rsi), %r11
	mov	%rdx, -8(%rdi)
	mov	%r11, -16(%rdi)
	ret
1829
1830 .p2align 4
/* Copy the 8 bytes ending at %rsi to the 8 bytes ending at %rdi, then
   fall through into L(write_0bytes), which only returns.  */
L(write_8bytes):
	mov	-8(%rsi), %rdx
	mov	%rdx, -8(%rdi)
L(write_0bytes):
	ret
1836
1837 .p2align 4
/* Copy the 73 bytes ending at %rsi to the 73 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_73bytes):
	mov	-4(%rsi), %edx
	mov	-9(%rsi), %r8
	mov	-17(%rsi), %r11
	mov	-25(%rsi), %r10
	mov	-33(%rsi), %r9
	mov	-41(%rsi), %rcx
	movdqu	-57(%rsi), %xmm1
	movdqu	-73(%rsi), %xmm0
	mov	%edx, -4(%rdi)
	mov	%r8, -9(%rdi)
	mov	%r11, -17(%rdi)
	mov	%r10, -25(%rdi)
	mov	%r9, -33(%rdi)
	mov	%rcx, -41(%rdi)
	movdqu	%xmm1, -57(%rdi)
	movdqu	%xmm0, -73(%rdi)
	ret
1856
1857 .p2align 4
/* Copy the 65 bytes ending at %rsi to the 65 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_65bytes):
	mov	-4(%rsi), %edx
	mov	-9(%rsi), %rcx
	mov	-17(%rsi), %r11
	mov	-25(%rsi), %r10
	mov	-33(%rsi), %r9
	movdqu	-49(%rsi), %xmm1
	movdqu	-65(%rsi), %xmm0
	mov	%edx, -4(%rdi)
	mov	%rcx, -9(%rdi)
	mov	%r11, -17(%rdi)
	mov	%r10, -25(%rdi)
	mov	%r9, -33(%rdi)
	movdqu	%xmm1, -49(%rdi)
	movdqu	%xmm0, -65(%rdi)
	ret
1874
1875 .p2align 4
/* Copy the 57 bytes ending at %rsi to the 57 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_57bytes):
	mov	-4(%rsi), %edx
	mov	-9(%rsi), %rcx
	mov	-17(%rsi), %r11
	mov	-25(%rsi), %r10
	mov	-33(%rsi), %r9
	mov	-41(%rsi), %r8
	movdqu	-57(%rsi), %xmm0
	mov	%edx, -4(%rdi)
	mov	%rcx, -9(%rdi)
	mov	%r11, -17(%rdi)
	mov	%r10, -25(%rdi)
	mov	%r9, -33(%rdi)
	mov	%r8, -41(%rdi)
	movdqu	%xmm0, -57(%rdi)
	ret
1892
1893 .p2align 4
/* Copy the 49 bytes ending at %rsi to the 49 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_49bytes):
	mov	-4(%rsi), %edx
	mov	-9(%rsi), %rcx
	mov	-17(%rsi), %r11
	mov	-25(%rsi), %r10
	mov	-33(%rsi), %r9
	movdqu	-49(%rsi), %xmm0
	mov	%edx, -4(%rdi)
	mov	%rcx, -9(%rdi)
	mov	%r11, -17(%rdi)
	mov	%r10, -25(%rdi)
	mov	%r9, -33(%rdi)
	movdqu	%xmm0, -49(%rdi)
	ret
1908
1909 .p2align 4
/* Copy the 41 bytes ending at %rsi to the 41 bytes ending at %rdi.
   Every load precedes every store.  */
L(write_41bytes):
	mov	-1(%rsi), %dl
	mov	-9(%rsi), %rcx
	mov	-17(%rsi), %r11
	mov	-25(%rsi), %r10
	mov	-33(%rsi), %r9
	mov	-41(%rsi), %r8
	mov	%dl, -1(%rdi)
	mov	%rcx, -9(%rdi)
	mov	%r11, -17(%rdi)
	mov	%r10, -25(%rdi)
	mov	%r9, -33(%rdi)
	mov	%r8, -41(%rdi)
	ret
1924
1925 .p2align 4
/* Copy the 33 bytes ending at %rsi to the 33 bytes ending at %rdi.
   Every load precedes every store.  */
L(write_33bytes):
	mov	-1(%rsi), %dl
	mov	-9(%rsi), %rcx
	mov	-17(%rsi), %r11
	mov	-25(%rsi), %r10
	mov	-33(%rsi), %r9
	mov	%dl, -1(%rdi)
	mov	%rcx, -9(%rdi)
	mov	%r11, -17(%rdi)
	mov	%r10, -25(%rdi)
	mov	%r9, -33(%rdi)
	ret
1938
1939 .p2align 4
/* Copy the 25 bytes ending at %rsi to the 25 bytes ending at %rdi.
   Every load precedes every store.  */
L(write_25bytes):
	mov	-1(%rsi), %dl
	mov	-9(%rsi), %rcx
	mov	-17(%rsi), %r11
	mov	-25(%rsi), %r10
	mov	%dl, -1(%rdi)
	mov	%rcx, -9(%rdi)
	mov	%r11, -17(%rdi)
	mov	%r10, -25(%rdi)
	ret
1950
1951 .p2align 4
/* Copy the 17 bytes ending at %rsi to the 17 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_17bytes):
	mov	-4(%rsi), %edx
	mov	-9(%rsi), %rcx
	mov	-17(%rsi), %r11
	mov	%edx, -4(%rdi)
	mov	%rcx, -9(%rdi)
	mov	%r11, -17(%rdi)
	ret
1960
1961 .p2align 4
/* Copy the 9 bytes ending at %rsi to the 9 bytes ending at %rdi using
   an 8-byte and an overlapping 4-byte move.  Loads precede stores.  */
L(write_9bytes):
	mov	-4(%rsi), %edx
	mov	-9(%rsi), %rcx
	mov	%edx, -4(%rdi)
	mov	%rcx, -9(%rdi)
	ret
1968
1969 .p2align 4
/* Copy the single byte at %rsi - 1 to %rdi - 1.  */
L(write_1bytes):
	mov	-1(%rsi), %dl		/* Byte load.  */
	mov	%dl, -1(%rdi)		/* Byte store.  */
	ret
1974
1975 .p2align 4
/* Copy the 74 bytes ending at %rsi to the 74 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_74bytes):
	mov	-4(%rsi), %edx
	mov	-10(%rsi), %rcx
	mov	-18(%rsi), %r11
	mov	-26(%rsi), %r10
	mov	-34(%rsi), %r9
	mov	-42(%rsi), %r8
	movdqu	-58(%rsi), %xmm1
	movdqu	-74(%rsi), %xmm0
	mov	%edx, -4(%rdi)
	mov	%rcx, -10(%rdi)
	mov	%r11, -18(%rdi)
	mov	%r10, -26(%rdi)
	mov	%r9, -34(%rdi)
	mov	%r8, -42(%rdi)
	movdqu	%xmm1, -58(%rdi)
	movdqu	%xmm0, -74(%rdi)
	ret
1994
1995 .p2align 4
/* Copy the 66 bytes ending at %rsi to the 66 bytes ending at %rdi.
   Every load precedes every store.  The two 16-byte moves cover
   [-66, -34) and the 8-byte moves start at -34, so no separate move at
   -42 is needed (it would only re-copy bytes already held in %xmm1).
   The two final moves overlap.  */
L(write_66bytes):
	movdqu	-66(%rsi), %xmm0
	movdqu	-50(%rsi), %xmm1
	mov	-34(%rsi), %r9
	mov	-26(%rsi), %r10
	mov	-18(%rsi), %r11
	mov	-10(%rsi), %rcx
	mov	-4(%rsi), %edx
	movdqu	%xmm0, -66(%rdi)
	movdqu	%xmm1, -50(%rdi)
	mov	%r9, -34(%rdi)
	mov	%r10, -26(%rdi)
	mov	%r11, -18(%rdi)
	mov	%rcx, -10(%rdi)
	mov	%edx, -4(%rdi)
	ret
2014
2015 .p2align 4
/* Copy the 58 bytes ending at %rsi to the 58 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_58bytes):
	mov	-4(%rsi), %edx
	mov	-10(%rsi), %rcx
	mov	-18(%rsi), %r11
	mov	-26(%rsi), %r10
	mov	-34(%rsi), %r9
	mov	-42(%rsi), %r8
	movdqu	-58(%rsi), %xmm1
	mov	%edx, -4(%rdi)
	mov	%rcx, -10(%rdi)
	mov	%r11, -18(%rdi)
	mov	%r10, -26(%rdi)
	mov	%r9, -34(%rdi)
	mov	%r8, -42(%rdi)
	movdqu	%xmm1, -58(%rdi)
	ret
2032
2033 .p2align 4
/* Copy the 50 bytes ending at %rsi to the 50 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_50bytes):
	mov	-4(%rsi), %edx
	mov	-10(%rsi), %rcx
	mov	-18(%rsi), %r11
	mov	-26(%rsi), %r10
	mov	-34(%rsi), %r9
	movdqu	-50(%rsi), %xmm0
	mov	%edx, -4(%rdi)
	mov	%rcx, -10(%rdi)
	mov	%r11, -18(%rdi)
	mov	%r10, -26(%rdi)
	mov	%r9, -34(%rdi)
	movdqu	%xmm0, -50(%rdi)
	ret
2048
2049 .p2align 4
/* Copy the 42 bytes ending at %rsi to the 42 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_42bytes):
	mov	-4(%rsi), %edx
	mov	-10(%rsi), %rcx
	mov	-18(%rsi), %r11
	mov	-26(%rsi), %r10
	mov	-34(%rsi), %r9
	mov	-42(%rsi), %r8
	mov	%edx, -4(%rdi)
	mov	%rcx, -10(%rdi)
	mov	%r11, -18(%rdi)
	mov	%r10, -26(%rdi)
	mov	%r9, -34(%rdi)
	mov	%r8, -42(%rdi)
	ret
2064
2065 .p2align 4
/* Copy the 34 bytes ending at %rsi to the 34 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_34bytes):
	mov	-4(%rsi), %edx
	mov	-10(%rsi), %rcx
	mov	-18(%rsi), %r11
	mov	-26(%rsi), %r10
	mov	-34(%rsi), %r9
	mov	%edx, -4(%rdi)
	mov	%rcx, -10(%rdi)
	mov	%r11, -18(%rdi)
	mov	%r10, -26(%rdi)
	mov	%r9, -34(%rdi)
	ret
2078
2079 .p2align 4
/* Copy the 26 bytes ending at %rsi to the 26 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_26bytes):
	mov	-4(%rsi), %edx
	mov	-10(%rsi), %rcx
	mov	-18(%rsi), %r11
	mov	-26(%rsi), %r10
	mov	%edx, -4(%rdi)
	mov	%rcx, -10(%rdi)
	mov	%r11, -18(%rdi)
	mov	%r10, -26(%rdi)
	ret
2090
2091 .p2align 4
/* Copy the 18 bytes ending at %rsi to the 18 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_18bytes):
	mov	-4(%rsi), %edx
	mov	-10(%rsi), %rcx
	mov	-18(%rsi), %r11
	mov	%edx, -4(%rdi)
	mov	%rcx, -10(%rdi)
	mov	%r11, -18(%rdi)
	ret
2100
2101 .p2align 4
/* Copy the 10 bytes ending at %rsi to the 10 bytes ending at %rdi using
   an 8-byte and an overlapping 4-byte move.  Loads precede stores.  */
L(write_10bytes):
	mov	-4(%rsi), %edx
	mov	-10(%rsi), %rcx
	mov	%edx, -4(%rdi)
	mov	%rcx, -10(%rdi)
	ret
2108
2109 .p2align 4
/* Copy the 2 bytes ending at %rsi to the 2 bytes ending at %rdi with a
   single 16-bit move.  */
L(write_2bytes):
	mov	-2(%rsi), %dx		/* 16-bit load.  */
	mov	%dx, -2(%rdi)		/* 16-bit store.  */
	ret
2114
2115 .p2align 4
/* Copy the 75 bytes ending at %rsi to the 75 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_75bytes):
	mov	-4(%rsi), %edx
	mov	-11(%rsi), %rcx
	mov	-19(%rsi), %r11
	mov	-27(%rsi), %r10
	mov	-35(%rsi), %r9
	mov	-43(%rsi), %r8
	movdqu	-59(%rsi), %xmm1
	movdqu	-75(%rsi), %xmm0
	mov	%edx, -4(%rdi)
	mov	%rcx, -11(%rdi)
	mov	%r11, -19(%rdi)
	mov	%r10, -27(%rdi)
	mov	%r9, -35(%rdi)
	mov	%r8, -43(%rdi)
	movdqu	%xmm1, -59(%rdi)
	movdqu	%xmm0, -75(%rdi)
	ret
2134
2135 .p2align 4
/* Copy the 67 bytes ending at %rsi to the 67 bytes ending at %rdi.
   Every load precedes every store.  The two 16-byte moves overlap
   each other, as do the two final moves.  */
L(write_67bytes):
	mov	-4(%rsi), %edx
	mov	-11(%rsi), %rcx
	mov	-19(%rsi), %r11
	mov	-27(%rsi), %r10
	mov	-35(%rsi), %r9
	mov	-43(%rsi), %r8
	movdqu	-59(%rsi), %xmm1
	movdqu	-67(%rsi), %xmm0
	mov	%edx, -4(%rdi)
	mov	%rcx, -11(%rdi)
	mov	%r11, -19(%rdi)
	mov	%r10, -27(%rdi)
	mov	%r9, -35(%rdi)
	mov	%r8, -43(%rdi)
	movdqu	%xmm1, -59(%rdi)
	movdqu	%xmm0, -67(%rdi)
	ret
2154
2155 .p2align 4
/* Copy the 59 bytes ending at %rsi to the 59 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_59bytes):
	mov	-4(%rsi), %edx
	mov	-11(%rsi), %rcx
	mov	-19(%rsi), %r11
	mov	-27(%rsi), %r10
	mov	-35(%rsi), %r9
	mov	-43(%rsi), %r8
	movdqu	-59(%rsi), %xmm0
	mov	%edx, -4(%rdi)
	mov	%rcx, -11(%rdi)
	mov	%r11, -19(%rdi)
	mov	%r10, -27(%rdi)
	mov	%r9, -35(%rdi)
	mov	%r8, -43(%rdi)
	movdqu	%xmm0, -59(%rdi)
	ret
2172
2173 .p2align 4
/* Copy the 51 bytes ending at %rsi to the 51 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_51bytes):
	mov	-4(%rsi), %edx
	mov	-11(%rsi), %rcx
	mov	-19(%rsi), %r11
	mov	-27(%rsi), %r10
	mov	-35(%rsi), %r9
	movdqu	-51(%rsi), %xmm0
	mov	%edx, -4(%rdi)
	mov	%rcx, -11(%rdi)
	mov	%r11, -19(%rdi)
	mov	%r10, -27(%rdi)
	mov	%r9, -35(%rdi)
	movdqu	%xmm0, -51(%rdi)
	ret
2188
2189 .p2align 4
/* Copy the 43 bytes ending at %rsi to the 43 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_43bytes):
	mov	-4(%rsi), %edx
	mov	-11(%rsi), %rcx
	mov	-19(%rsi), %r11
	mov	-27(%rsi), %r10
	mov	-35(%rsi), %r9
	mov	-43(%rsi), %r8
	mov	%edx, -4(%rdi)
	mov	%rcx, -11(%rdi)
	mov	%r11, -19(%rdi)
	mov	%r10, -27(%rdi)
	mov	%r9, -35(%rdi)
	mov	%r8, -43(%rdi)
	ret
2204
2205 .p2align 4
/* Copy the 35 bytes ending at %rsi to the 35 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_35bytes):
	mov	-4(%rsi), %edx
	mov	-11(%rsi), %rcx
	mov	-19(%rsi), %r11
	mov	-27(%rsi), %r10
	mov	-35(%rsi), %r9
	mov	%edx, -4(%rdi)
	mov	%rcx, -11(%rdi)
	mov	%r11, -19(%rdi)
	mov	%r10, -27(%rdi)
	mov	%r9, -35(%rdi)
	ret
2218
2219 .p2align 4
/* Copy the 27 bytes ending at %rsi to the 27 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_27bytes):
	mov	-4(%rsi), %edx
	mov	-11(%rsi), %rcx
	mov	-19(%rsi), %r11
	mov	-27(%rsi), %r10
	mov	%edx, -4(%rdi)
	mov	%rcx, -11(%rdi)
	mov	%r11, -19(%rdi)
	mov	%r10, -27(%rdi)
	ret
2230
2231 .p2align 4
/* Copy the 19 bytes ending at %rsi to the 19 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_19bytes):
	mov	-4(%rsi), %edx
	mov	-11(%rsi), %rcx
	mov	-19(%rsi), %r11
	mov	%edx, -4(%rdi)
	mov	%rcx, -11(%rdi)
	mov	%r11, -19(%rdi)
	ret
2240
2241 .p2align 4
/* Copy the 11 bytes ending at %rsi to the 11 bytes ending at %rdi using
   an 8-byte and an overlapping 4-byte move.  Loads precede stores.  */
L(write_11bytes):
	mov	-4(%rsi), %edx
	mov	-11(%rsi), %rcx
	mov	%edx, -4(%rdi)
	mov	%rcx, -11(%rdi)
	ret
2248
2249 .p2align 4
/* Copy the 3 bytes ending at %rsi to the 3 bytes ending at %rdi using
   two 16-bit moves that overlap in the middle byte.  Loads precede
   stores.  */
L(write_3bytes):
	mov	-2(%rsi), %cx
	mov	-3(%rsi), %dx
	mov	%cx, -2(%rdi)
	mov	%dx, -3(%rdi)
	ret
2256
2257 .p2align 4
/* Copy the 76 bytes ending at %rsi to the 76 bytes ending at %rdi.
   Every load precedes every store.  */
L(write_76bytes):
	mov	-4(%rsi), %edx
	mov	-12(%rsi), %rcx
	mov	-20(%rsi), %r11
	mov	-28(%rsi), %r10
	mov	-36(%rsi), %r9
	mov	-44(%rsi), %r8
	movdqu	-60(%rsi), %xmm1
	movdqu	-76(%rsi), %xmm0
	mov	%edx, -4(%rdi)
	mov	%rcx, -12(%rdi)
	mov	%r11, -20(%rdi)
	mov	%r10, -28(%rdi)
	mov	%r9, -36(%rdi)
	mov	%r8, -44(%rdi)
	movdqu	%xmm1, -60(%rdi)
	movdqu	%xmm0, -76(%rdi)
	ret
2276
2277 .p2align 4
/* Copy the 68 bytes ending at %rsi to the 68 bytes ending at %rdi.
   Every load precedes every store.  */
L(write_68bytes):
	mov	-4(%rsi), %edx
	mov	-12(%rsi), %rcx
	mov	-20(%rsi), %r11
	mov	-28(%rsi), %r10
	mov	-36(%rsi), %r9
	movdqu	-52(%rsi), %xmm1
	movdqu	-68(%rsi), %xmm0
	mov	%edx, -4(%rdi)
	mov	%rcx, -12(%rdi)
	mov	%r11, -20(%rdi)
	mov	%r10, -28(%rdi)
	mov	%r9, -36(%rdi)
	movdqu	%xmm1, -52(%rdi)
	movdqu	%xmm0, -68(%rdi)
	ret
2294
2295 .p2align 4
/* Copy the 60 bytes ending at %rsi to the 60 bytes ending at %rdi.
   Every load precedes every store.  */
L(write_60bytes):
	mov	-4(%rsi), %edx
	mov	-12(%rsi), %rcx
	mov	-20(%rsi), %r11
	mov	-28(%rsi), %r10
	mov	-36(%rsi), %r9
	mov	-44(%rsi), %r8
	movdqu	-60(%rsi), %xmm0
	mov	%edx, -4(%rdi)
	mov	%rcx, -12(%rdi)
	mov	%r11, -20(%rdi)
	mov	%r10, -28(%rdi)
	mov	%r9, -36(%rdi)
	mov	%r8, -44(%rdi)
	movdqu	%xmm0, -60(%rdi)
	ret
2312
2313 .p2align 4
/* Copy the 52 bytes ending at %rsi to the 52 bytes ending at %rdi.
   Every load precedes every store.  */
L(write_52bytes):
	mov	-4(%rsi), %edx
	mov	-12(%rsi), %rcx
	mov	-20(%rsi), %r11
	mov	-28(%rsi), %r10
	mov	-36(%rsi), %r9
	movdqu	-52(%rsi), %xmm0
	mov	%edx, -4(%rdi)
	mov	%rcx, -12(%rdi)
	mov	%r11, -20(%rdi)
	mov	%r10, -28(%rdi)
	mov	%r9, -36(%rdi)
	movdqu	%xmm0, -52(%rdi)
	ret
2328
2329 .p2align 4
/* Copy the 44 bytes ending at %rsi to the 44 bytes ending at %rdi.
   Every load precedes every store.  */
L(write_44bytes):
	mov	-4(%rsi), %edx
	mov	-12(%rsi), %rcx
	mov	-20(%rsi), %r11
	mov	-28(%rsi), %r10
	mov	-36(%rsi), %r9
	mov	-44(%rsi), %r8
	mov	%edx, -4(%rdi)
	mov	%rcx, -12(%rdi)
	mov	%r11, -20(%rdi)
	mov	%r10, -28(%rdi)
	mov	%r9, -36(%rdi)
	mov	%r8, -44(%rdi)
	ret
2344
2345 .p2align 4
/* Copy the 36 bytes ending at %rsi to the 36 bytes ending at %rdi.
   Every load precedes every store.  */
L(write_36bytes):
	mov	-4(%rsi), %edx
	mov	-12(%rsi), %rcx
	mov	-20(%rsi), %r11
	mov	-28(%rsi), %r10
	mov	-36(%rsi), %r9
	mov	%edx, -4(%rdi)
	mov	%rcx, -12(%rdi)
	mov	%r11, -20(%rdi)
	mov	%r10, -28(%rdi)
	mov	%r9, -36(%rdi)
	ret
2358
2359 .p2align 4
/* Copy the 28 bytes ending at %rsi to the 28 bytes ending at %rdi.
   Every load precedes every store.  */
L(write_28bytes):
	mov	-4(%rsi), %edx
	mov	-12(%rsi), %rcx
	mov	-20(%rsi), %r11
	mov	-28(%rsi), %r10
	mov	%edx, -4(%rdi)
	mov	%rcx, -12(%rdi)
	mov	%r11, -20(%rdi)
	mov	%r10, -28(%rdi)
	ret
2370
2371 .p2align 4
/* Copy the 20 bytes ending at %rsi to the 20 bytes ending at %rdi.
   Every load precedes every store.  */
L(write_20bytes):
	mov	-4(%rsi), %edx
	mov	-12(%rsi), %rcx
	mov	-20(%rsi), %r11
	mov	%edx, -4(%rdi)
	mov	%rcx, -12(%rdi)
	mov	%r11, -20(%rdi)
	ret
2380
2381 .p2align 4
/* Copy the 12 bytes ending at %rsi to the 12 bytes ending at %rdi with
   one 8-byte and one 4-byte move.  Loads precede stores.  */
L(write_12bytes):
	mov	-4(%rsi), %edx
	mov	-12(%rsi), %rcx
	mov	%edx, -4(%rdi)
	mov	%rcx, -12(%rdi)
	ret
2388
2389 .p2align 4
/* Copy the 4 bytes ending at %rsi to the 4 bytes ending at %rdi with a
   single 32-bit move.  */
L(write_4bytes):
	mov	-4(%rsi), %edx		/* 32-bit load.  */
	mov	%edx, -4(%rdi)		/* 32-bit store.  */
	ret
2394
2395 .p2align 4
/* Copy the 77 bytes ending at %rsi to the 77 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_77bytes):
	mov	-8(%rsi), %rdx
	mov	-13(%rsi), %rcx
	mov	-21(%rsi), %r11
	mov	-29(%rsi), %r10
	mov	-37(%rsi), %r9
	mov	-45(%rsi), %r8
	movdqu	-61(%rsi), %xmm1
	movdqu	-77(%rsi), %xmm0
	mov	%rdx, -8(%rdi)
	mov	%rcx, -13(%rdi)
	mov	%r11, -21(%rdi)
	mov	%r10, -29(%rdi)
	mov	%r9, -37(%rdi)
	mov	%r8, -45(%rdi)
	movdqu	%xmm1, -61(%rdi)
	movdqu	%xmm0, -77(%rdi)
	ret
2414
2415 .p2align 4
/* Copy the 69 bytes ending at %rsi to the 69 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_69bytes):
	mov	-8(%rsi), %rdx
	mov	-13(%rsi), %rcx
	mov	-21(%rsi), %r11
	mov	-29(%rsi), %r10
	mov	-37(%rsi), %r9
	movdqu	-53(%rsi), %xmm1
	movdqu	-69(%rsi), %xmm0
	mov	%rdx, -8(%rdi)
	mov	%rcx, -13(%rdi)
	mov	%r11, -21(%rdi)
	mov	%r10, -29(%rdi)
	mov	%r9, -37(%rdi)
	movdqu	%xmm1, -53(%rdi)
	movdqu	%xmm0, -69(%rdi)
	ret
2432
2433 .p2align 4
/* Copy the 61 bytes ending at %rsi to the 61 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_61bytes):
	mov	-8(%rsi), %rdx
	mov	-13(%rsi), %rcx
	mov	-21(%rsi), %r11
	mov	-29(%rsi), %r10
	mov	-37(%rsi), %r9
	mov	-45(%rsi), %r8
	movdqu	-61(%rsi), %xmm0
	mov	%rdx, -8(%rdi)
	mov	%rcx, -13(%rdi)
	mov	%r11, -21(%rdi)
	mov	%r10, -29(%rdi)
	mov	%r9, -37(%rdi)
	mov	%r8, -45(%rdi)
	movdqu	%xmm0, -61(%rdi)
	ret
2450
2451 .p2align 4
/* Copy the 53 bytes ending at %rsi to the 53 bytes ending at %rdi.
   Every load precedes every store.  The 16-byte move covers
   [-53, -37) and the 8-byte moves start at -37, so nothing is loaded
   from -45: that value was never stored.  The two final moves
   overlap.  */
L(write_53bytes):
	movdqu	-53(%rsi), %xmm0
	mov	-37(%rsi), %r9
	mov	-29(%rsi), %r10
	mov	-21(%rsi), %r11
	mov	-13(%rsi), %rcx
	mov	-8(%rsi), %rdx
	movdqu	%xmm0, -53(%rdi)
	mov	%r9, -37(%rdi)
	mov	%r10, -29(%rdi)
	mov	%r11, -21(%rdi)
	mov	%rcx, -13(%rdi)
	mov	%rdx, -8(%rdi)
	ret
2467
2468 .p2align 4
/* Copy the 45 bytes ending at %rsi to the 45 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_45bytes):
	mov	-8(%rsi), %rdx
	mov	-13(%rsi), %rcx
	mov	-21(%rsi), %r11
	mov	-29(%rsi), %r10
	mov	-37(%rsi), %r9
	mov	-45(%rsi), %r8
	mov	%rdx, -8(%rdi)
	mov	%rcx, -13(%rdi)
	mov	%r11, -21(%rdi)
	mov	%r10, -29(%rdi)
	mov	%r9, -37(%rdi)
	mov	%r8, -45(%rdi)
	ret
2483
2484 .p2align 4
/* Copy the 37 bytes ending at %rsi to the 37 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_37bytes):
	mov	-8(%rsi), %rdx
	mov	-13(%rsi), %rcx
	mov	-21(%rsi), %r11
	mov	-29(%rsi), %r10
	mov	-37(%rsi), %r9
	mov	%rdx, -8(%rdi)
	mov	%rcx, -13(%rdi)
	mov	%r11, -21(%rdi)
	mov	%r10, -29(%rdi)
	mov	%r9, -37(%rdi)
	ret
2497
2498 .p2align 4
/* Copy the 29 bytes ending at %rsi to the 29 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_29bytes):
	mov	-8(%rsi), %rdx
	mov	-13(%rsi), %rcx
	mov	-21(%rsi), %r11
	mov	-29(%rsi), %r10
	mov	%rdx, -8(%rdi)
	mov	%rcx, -13(%rdi)
	mov	%r11, -21(%rdi)
	mov	%r10, -29(%rdi)
	ret
2509
2510 .p2align 4
/* Copy the 21 bytes ending at %rsi to the 21 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_21bytes):
	mov	-8(%rsi), %rdx
	mov	-13(%rsi), %rcx
	mov	-21(%rsi), %r11
	mov	%rdx, -8(%rdi)
	mov	%rcx, -13(%rdi)
	mov	%r11, -21(%rdi)
	ret
2519
2520 .p2align 4
/* Copy the 13 bytes ending at %rsi to the 13 bytes ending at %rdi using
   two overlapping 8-byte moves.  Loads precede stores.  */
L(write_13bytes):
	mov	-8(%rsi), %rdx
	mov	-13(%rsi), %rcx
	mov	%rdx, -8(%rdi)
	mov	%rcx, -13(%rdi)
	ret
2527
2528 .p2align 4
/* Copy the 5 bytes ending at %rsi to the 5 bytes ending at %rdi using
   two overlapping 4-byte moves.  Loads precede stores.  */
L(write_5bytes):
	mov	-4(%rsi), %ecx
	mov	-5(%rsi), %edx
	mov	%ecx, -4(%rdi)
	mov	%edx, -5(%rdi)
	ret
2535
2536 .p2align 4
/* Copy the 78 bytes ending at %rsi to the 78 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_78bytes):
	mov	-8(%rsi), %rdx
	mov	-14(%rsi), %rcx
	mov	-22(%rsi), %r11
	mov	-30(%rsi), %r10
	mov	-38(%rsi), %r9
	mov	-46(%rsi), %r8
	movdqu	-62(%rsi), %xmm1
	movdqu	-78(%rsi), %xmm0
	mov	%rdx, -8(%rdi)
	mov	%rcx, -14(%rdi)
	mov	%r11, -22(%rdi)
	mov	%r10, -30(%rdi)
	mov	%r9, -38(%rdi)
	mov	%r8, -46(%rdi)
	movdqu	%xmm1, -62(%rdi)
	movdqu	%xmm0, -78(%rdi)
	ret
2555
2556 .p2align 4
/* Copy the 70 bytes ending at %rsi to the 70 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_70bytes):
	mov	-8(%rsi), %rdx
	mov	-14(%rsi), %rcx
	mov	-22(%rsi), %r11
	mov	-30(%rsi), %r10
	mov	-38(%rsi), %r9
	movdqu	-54(%rsi), %xmm1
	movdqu	-70(%rsi), %xmm0
	mov	%rdx, -8(%rdi)
	mov	%rcx, -14(%rdi)
	mov	%r11, -22(%rdi)
	mov	%r10, -30(%rdi)
	mov	%r9, -38(%rdi)
	movdqu	%xmm1, -54(%rdi)
	movdqu	%xmm0, -70(%rdi)
	ret
2573
2574 .p2align 4
/* Copy the 62 bytes ending at %rsi to the 62 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_62bytes):
	mov	-8(%rsi), %rdx
	mov	-14(%rsi), %rcx
	mov	-22(%rsi), %r11
	mov	-30(%rsi), %r10
	mov	-38(%rsi), %r9
	mov	-46(%rsi), %r8
	movdqu	-62(%rsi), %xmm0
	mov	%rdx, -8(%rdi)
	mov	%rcx, -14(%rdi)
	mov	%r11, -22(%rdi)
	mov	%r10, -30(%rdi)
	mov	%r9, -38(%rdi)
	mov	%r8, -46(%rdi)
	movdqu	%xmm0, -62(%rdi)
	ret
2591
2592 .p2align 4
/* Copy the 54 bytes ending at %rsi to the 54 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_54bytes):
	mov	-8(%rsi), %rdx
	mov	-14(%rsi), %rcx
	mov	-22(%rsi), %r11
	mov	-30(%rsi), %r10
	mov	-38(%rsi), %r9
	movdqu	-54(%rsi), %xmm0
	mov	%rdx, -8(%rdi)
	mov	%rcx, -14(%rdi)
	mov	%r11, -22(%rdi)
	mov	%r10, -30(%rdi)
	mov	%r9, -38(%rdi)
	movdqu	%xmm0, -54(%rdi)
	ret
2607
2608 .p2align 4
/* Copy the 46 bytes ending at %rsi to the 46 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_46bytes):
	mov	-8(%rsi), %rdx
	mov	-14(%rsi), %rcx
	mov	-22(%rsi), %r11
	mov	-30(%rsi), %r10
	mov	-38(%rsi), %r9
	mov	-46(%rsi), %r8
	mov	%rdx, -8(%rdi)
	mov	%rcx, -14(%rdi)
	mov	%r11, -22(%rdi)
	mov	%r10, -30(%rdi)
	mov	%r9, -38(%rdi)
	mov	%r8, -46(%rdi)
	ret
2623
2624 .p2align 4
/* Copy the 38 bytes ending at %rsi to the 38 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_38bytes):
	mov	-8(%rsi), %rdx
	mov	-14(%rsi), %rcx
	mov	-22(%rsi), %r11
	mov	-30(%rsi), %r10
	mov	-38(%rsi), %r9
	mov	%rdx, -8(%rdi)
	mov	%rcx, -14(%rdi)
	mov	%r11, -22(%rdi)
	mov	%r10, -30(%rdi)
	mov	%r9, -38(%rdi)
	ret
2637
2638 .p2align 4
/* Copy the 30 bytes ending at %rsi to the 30 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_30bytes):
	mov	-8(%rsi), %rdx
	mov	-14(%rsi), %rcx
	mov	-22(%rsi), %r11
	mov	-30(%rsi), %r10
	mov	%rdx, -8(%rdi)
	mov	%rcx, -14(%rdi)
	mov	%r11, -22(%rdi)
	mov	%r10, -30(%rdi)
	ret
2649
2650 .p2align 4
/* Copy the 22 bytes ending at %rsi to the 22 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_22bytes):
	mov	-8(%rsi), %rdx
	mov	-14(%rsi), %rcx
	mov	-22(%rsi), %r11
	mov	%rdx, -8(%rdi)
	mov	%rcx, -14(%rdi)
	mov	%r11, -22(%rdi)
	ret
2659
2660 .p2align 4
/* Copy the 14 bytes ending at %rsi to the 14 bytes ending at %rdi using
   two overlapping 8-byte moves.  Loads precede stores.  */
L(write_14bytes):
	mov	-8(%rsi), %rdx
	mov	-14(%rsi), %rcx
	mov	%rdx, -8(%rdi)
	mov	%rcx, -14(%rdi)
	ret
2667
2668 .p2align 4
/* Copy the 6 bytes ending at %rsi to the 6 bytes ending at %rdi using
   two overlapping 4-byte moves.  Loads precede stores.  */
L(write_6bytes):
	mov	-4(%rsi), %ecx
	mov	-6(%rsi), %edx
	mov	%ecx, -4(%rdi)
	mov	%edx, -6(%rdi)
	ret
2675
2676 .p2align 4
/* Copy the 79 bytes ending at %rsi to the 79 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_79bytes):
	mov	-8(%rsi), %rdx
	mov	-15(%rsi), %rcx
	mov	-23(%rsi), %r11
	mov	-31(%rsi), %r10
	mov	-39(%rsi), %r9
	mov	-47(%rsi), %r8
	movdqu	-63(%rsi), %xmm1
	movdqu	-79(%rsi), %xmm0
	mov	%rdx, -8(%rdi)
	mov	%rcx, -15(%rdi)
	mov	%r11, -23(%rdi)
	mov	%r10, -31(%rdi)
	mov	%r9, -39(%rdi)
	mov	%r8, -47(%rdi)
	movdqu	%xmm1, -63(%rdi)
	movdqu	%xmm0, -79(%rdi)
	ret
2695
2696 .p2align 4
/* Copy the 71 bytes ending at %rsi to the 71 bytes ending at %rdi.
   Every load precedes every store; the two final moves overlap.  */
L(write_71bytes):
	mov	-8(%rsi), %rdx
	mov	-15(%rsi), %rcx
	mov	-23(%rsi), %r11
	mov	-31(%rsi), %r10
	mov	-39(%rsi), %r9
	movdqu	-55(%rsi), %xmm1
	movdqu	-71(%rsi), %xmm0
	mov	%rdx, -8(%rdi)
	mov	%rcx, -15(%rdi)
	mov	%r11, -23(%rdi)
	mov	%r10, -31(%rdi)
	mov	%r9, -39(%rdi)
	movdqu	%xmm1, -55(%rdi)
	movdqu	%xmm0, -71(%rdi)
	ret
2713
2714 .p2align 4
L(write_63bytes):
	/* Copy the final 63 bytes ending at %rsi to the 63 bytes ending at
	   %rdi: one unaligned 16-byte chunk, then six 8-byte chunks of
	   which the last two overlap by one byte.  All loads precede all
	   stores.  Clobbers %xmm0, %r8-%r11, %rcx, %rdx.  */
	movdqu	-63(%rsi), %xmm0
	movq	-47(%rsi), %r8
	movq	-39(%rsi), %r9
	movq	-31(%rsi), %r10
	movq	-23(%rsi), %r11
	movq	-15(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movdqu	%xmm0, -63(%rdi)
	movq	%r8, -47(%rdi)
	movq	%r9, -39(%rdi)
	movq	%r10, -31(%rdi)
	movq	%r11, -23(%rdi)
	movq	%rcx, -15(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2731
2732 .p2align 4
L(write_55bytes):
	/* Copy the final 55 bytes ending at %rsi to the 55 bytes ending at
	   %rdi: one unaligned 16-byte chunk, then five 8-byte chunks of
	   which the last two overlap by one byte.  All loads precede all
	   stores.  Clobbers %xmm0, %r9-%r11, %rcx, %rdx.  */
	movdqu	-55(%rsi), %xmm0
	movq	-39(%rsi), %r9
	movq	-31(%rsi), %r10
	movq	-23(%rsi), %r11
	movq	-15(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movdqu	%xmm0, -55(%rdi)
	movq	%r9, -39(%rdi)
	movq	%r10, -31(%rdi)
	movq	%r11, -23(%rdi)
	movq	%rcx, -15(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2747
2748 .p2align 4
L(write_47bytes):
	/* Copy the final 47 bytes ending at %rsi to the 47 bytes ending at
	   %rdi: six 8-byte chunks, the last two overlapping by one byte.
	   All loads precede all stores.  Clobbers %r8-%r11, %rcx, %rdx.  */
	movq	-47(%rsi), %r8
	movq	-39(%rsi), %r9
	movq	-31(%rsi), %r10
	movq	-23(%rsi), %r11
	movq	-15(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movq	%r8, -47(%rdi)
	movq	%r9, -39(%rdi)
	movq	%r10, -31(%rdi)
	movq	%r11, -23(%rdi)
	movq	%rcx, -15(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2763
2764 .p2align 4
L(write_39bytes):
	/* Copy the final 39 bytes ending at %rsi to the 39 bytes ending at
	   %rdi: five 8-byte chunks, the last two overlapping by one byte.
	   All loads precede all stores.  Clobbers %r9-%r11, %rcx, %rdx.  */
	movq	-39(%rsi), %r9
	movq	-31(%rsi), %r10
	movq	-23(%rsi), %r11
	movq	-15(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movq	%r9, -39(%rdi)
	movq	%r10, -31(%rdi)
	movq	%r11, -23(%rdi)
	movq	%rcx, -15(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2777
2778 .p2align 4
L(write_31bytes):
	/* Copy the final 31 bytes ending at %rsi to the 31 bytes ending at
	   %rdi: four 8-byte chunks, the last two overlapping by one byte.
	   All loads precede all stores.  Clobbers %r10, %r11, %rcx, %rdx.  */
	movq	-31(%rsi), %r10
	movq	-23(%rsi), %r11
	movq	-15(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movq	%r10, -31(%rdi)
	movq	%r11, -23(%rdi)
	movq	%rcx, -15(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2789
2790 .p2align 4
L(write_23bytes):
	/* Copy the final 23 bytes ending at %rsi to the 23 bytes ending at
	   %rdi: three 8-byte chunks, the last two overlapping by one byte.
	   All loads precede all stores.  Clobbers %r11, %rcx, %rdx.  */
	movq	-23(%rsi), %r11
	movq	-15(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movq	%r11, -23(%rdi)
	movq	%rcx, -15(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2799
2800 .p2align 4
L(write_15bytes):
	/* Copy the final 15 bytes ending at %rsi to the 15 bytes ending at
	   %rdi as two 8-byte chunks that overlap by one byte.  Both loads
	   precede both stores.  Clobbers %rcx, %rdx.  */
	movq	-15(%rsi), %rcx
	movq	-8(%rsi), %rdx
	movq	%rcx, -15(%rdi)
	movq	%rdx, -8(%rdi)
	ret
2807
2808 .p2align 4
L(write_7bytes):
	/* Copy the final 7 bytes ending at %rsi to the 7 bytes ending at
	   %rdi as two 4-byte chunks that overlap by one byte.  Both loads
	   precede both stores.  Clobbers %rcx, %rdx (32-bit writes).  */
	movl	-7(%rsi), %edx
	movl	-4(%rsi), %ecx
	movl	%edx, -7(%rdi)
	movl	%ecx, -4(%rdi)
	ret
2815
2816 .p2align 4
/* Forward copy that writes the destination with non-temporal stores
   (movntdq) in 0x80-byte blocks.
   On entry %rsi/%rdi are the current source/destination, %rdx is the
   remaining byte count, and %xmm0 holds 16 bytes that still have to be
   stored at (%r8).  %xmm0 and %r8 are set up before this label
   (presumably the first 16 source bytes and the original destination;
   confirm against the code that jumps here).  movntdq requires a
   16-byte aligned memory operand, so %rdi must already be aligned.  */
L(large_page_fwd):
 movdqu (%rsi), %xmm1
 lea 16(%rsi), %rsi
 movdqu %xmm0, (%r8)
 movntdq %xmm1, (%rdi)
 lea 16(%rdi), %rdi
 /* Account for the 0x10 bytes just copied and bias the count by 0x80,
    so that the borrow produced by the sub in the loop tells whether
    another full 0x80-byte block remains.  */
 lea -0x90(%rdx), %rdx
#ifdef USE_AS_MEMMOVE
 /* %r9 = source - destination.  When that distance is not below the
    biased count, use the same non-temporal loop as memcpy.  */
 mov %rsi, %r9
 sub %rdi, %r9
 cmp %rdx, %r9
 jae L(memmove_is_memcpy_fwd)
 /* Otherwise switch to the cached-store loop when the biased count is
    below 4 * %rcx.  %rcx is set before this label (presumably a cache
    size threshold; confirm).  */
 shl $2, %rcx
 cmp %rcx, %rdx
 jb L(ll_cache_copy_fwd_start)
L(memmove_is_memcpy_fwd):
#endif
L(large_page_loop):
 /* Load 0x80 bytes, then store them; all eight loads come first.  */
 movdqu (%rsi), %xmm0
 movdqu 0x10(%rsi), %xmm1
 movdqu 0x20(%rsi), %xmm2
 movdqu 0x30(%rsi), %xmm3
 movdqu 0x40(%rsi), %xmm4
 movdqu 0x50(%rsi), %xmm5
 movdqu 0x60(%rsi), %xmm6
 movdqu 0x70(%rsi), %xmm7
 lea 0x80(%rsi), %rsi

 /* The flags set here are consumed by the jae below; movntdq and lea
    do not modify flags.  */
 sub $0x80, %rdx
 movntdq %xmm0, (%rdi)
 movntdq %xmm1, 0x10(%rdi)
 movntdq %xmm2, 0x20(%rdi)
 movntdq %xmm3, 0x30(%rdi)
 movntdq %xmm4, 0x40(%rdi)
 movntdq %xmm5, 0x50(%rdi)
 movntdq %xmm6, 0x60(%rdi)
 movntdq %xmm7, 0x70(%rdi)
 lea 0x80(%rdi), %rdi
 jae L(large_page_loop)
 /* %rdx is now (bytes left - 0x80).  Compare first, then remove the
    bias with lea (which keeps the flags): skip the 0x40-byte step when
    fewer than 0x40 bytes are left.  */
 cmp $-0x40, %rdx
 lea 0x80(%rdx), %rdx
 jl L(large_page_less_64bytes)

 /* Copy one more 0x40-byte block.  */
 movdqu (%rsi), %xmm0
 movdqu 0x10(%rsi), %xmm1
 movdqu 0x20(%rsi), %xmm2
 movdqu 0x30(%rsi), %xmm3
 lea 0x40(%rsi), %rsi

 movntdq %xmm0, (%rdi)
 movntdq %xmm1, 0x10(%rdi)
 movntdq %xmm2, 0x20(%rdi)
 movntdq %xmm3, 0x30(%rdi)
 lea 0x40(%rdi), %rdi
 sub $0x40, %rdx
L(large_page_less_64bytes):
 /* Move both pointers past the end of the remaining %rdx bytes, which
    is what the write_Nbytes tails expect (they use negative offsets),
    fence the non-temporal stores, and dispatch on the byte count.  */
 add %rdx, %rsi
 add %rdx, %rdi
 sfence
 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
2877
#ifdef USE_AS_MEMMOVE
 /* memmove-only forward loop reached from L(large_page_fwd).  It has
    the same structure as L(large_page_loop) but uses ordinary aligned
    stores (movaps) instead of non-temporal ones, prefetches the source
    ahead of the loads, and needs no sfence.  %rdx carries the same
    0x80-biased remaining count.  movaps requires a 16-byte aligned
    memory operand, so %rdi must be aligned.  */
 .p2align 4
L(ll_cache_copy_fwd_start):
 /* Prefetch 0x1c0 and 0x200 bytes ahead of the source pointer.  */
 prefetcht0 0x1c0(%rsi)
 prefetcht0 0x200(%rsi)
 movdqu (%rsi), %xmm0
 movdqu 0x10(%rsi), %xmm1
 movdqu 0x20(%rsi), %xmm2
 movdqu 0x30(%rsi), %xmm3
 movdqu 0x40(%rsi), %xmm4
 movdqu 0x50(%rsi), %xmm5
 movdqu 0x60(%rsi), %xmm6
 movdqu 0x70(%rsi), %xmm7
 lea 0x80(%rsi), %rsi

 /* Flags from this sub are consumed by the jae below; movaps and lea
    leave them unchanged.  */
 sub $0x80, %rdx
 movaps %xmm0, (%rdi)
 movaps %xmm1, 0x10(%rdi)
 movaps %xmm2, 0x20(%rdi)
 movaps %xmm3, 0x30(%rdi)
 movaps %xmm4, 0x40(%rdi)
 movaps %xmm5, 0x50(%rdi)
 movaps %xmm6, 0x60(%rdi)
 movaps %xmm7, 0x70(%rdi)
 lea 0x80(%rdi), %rdi
 jae L(ll_cache_copy_fwd_start)
 /* %rdx is (bytes left - 0x80): test against -0x40, then undo the
    bias with lea, which preserves the flags for the jl.  */
 cmp $-0x40, %rdx
 lea 0x80(%rdx), %rdx
 jl L(large_page_ll_less_fwd_64bytes)

 /* Copy one more 0x40-byte block.  */
 movdqu (%rsi), %xmm0
 movdqu 0x10(%rsi), %xmm1
 movdqu 0x20(%rsi), %xmm2
 movdqu 0x30(%rsi), %xmm3
 lea 0x40(%rsi), %rsi

 movaps %xmm0, (%rdi)
 movaps %xmm1, 0x10(%rdi)
 movaps %xmm2, 0x20(%rdi)
 movaps %xmm3, 0x30(%rdi)
 lea 0x40(%rdi), %rdi
 sub $0x40, %rdx
L(large_page_ll_less_fwd_64bytes):
 /* Point both registers past the end of the remaining %rdx bytes and
    let the jump table copy the tail.  */
 add %rdx, %rsi
 add %rdx, %rdi
 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)

#endif
2926 .p2align 4
/* Backward copy (from high addresses to low) that writes the
   destination with non-temporal stores in 0x80-byte blocks.
   On entry %rsi/%rdi point just past the bytes still to be copied,
   %rdx is the remaining byte count, and %xmm0 holds 16 bytes that still
   have to be stored at (%r8).  %xmm0 and %r8 are set up before this
   label (outside this block; confirm their meaning against the code
   that jumps here).  movdqa/movntdq require a 16-byte aligned memory
   operand, so %rdi must already be aligned.  */
L(large_page_bwd):
 movdqu -0x10(%rsi), %xmm1
 lea -16(%rsi), %rsi
 movdqu %xmm0, (%r8)
 movdqa %xmm1, -0x10(%rdi)
 lea -16(%rdi), %rdi
 /* Account for the 0x10 bytes just copied and bias the count by 0x80,
    so that the borrow produced by the sub in the loop tells whether
    another full 0x80-byte block remains.  */
 lea -0x90(%rdx), %rdx
#ifdef USE_AS_MEMMOVE
 /* %r9 = destination - source.  When that distance is not below the
    biased count, use the same non-temporal loop as memcpy.  */
 mov %rdi, %r9
 sub %rsi, %r9
 cmp %rdx, %r9
 jae L(memmove_is_memcpy_bwd)
 /* Otherwise switch to the cached-store loop when the distance is
    below %rcx (set before this label; presumably a cache size
    threshold; confirm).
    NOTE(review): the forward path instead compares the count against
    %rcx << 2; confirm that the difference is intentional.  */
 cmp %rcx, %r9
 jb L(ll_cache_copy_bwd_start)
L(memmove_is_memcpy_bwd):
#endif
L(large_page_bwd_loop):
 /* Load the 0x80 bytes below %rsi, then store them below %rdi; all
    eight loads come first.  */
 movdqu -0x10(%rsi), %xmm0
 movdqu -0x20(%rsi), %xmm1
 movdqu -0x30(%rsi), %xmm2
 movdqu -0x40(%rsi), %xmm3
 movdqu -0x50(%rsi), %xmm4
 movdqu -0x60(%rsi), %xmm5
 movdqu -0x70(%rsi), %xmm6
 movdqu -0x80(%rsi), %xmm7
 lea -0x80(%rsi), %rsi

 /* The flags set here are consumed by the jae below; movntdq and lea
    do not modify flags.  */
 sub $0x80, %rdx
 movntdq %xmm0, -0x10(%rdi)
 movntdq %xmm1, -0x20(%rdi)
 movntdq %xmm2, -0x30(%rdi)
 movntdq %xmm3, -0x40(%rdi)
 movntdq %xmm4, -0x50(%rdi)
 movntdq %xmm5, -0x60(%rdi)
 movntdq %xmm6, -0x70(%rdi)
 movntdq %xmm7, -0x80(%rdi)
 lea -0x80(%rdi), %rdi
 jae L(large_page_bwd_loop)
 /* %rdx is now (bytes left - 0x80).  Compare first, then remove the
    bias with lea (which keeps the flags): skip the 0x40-byte step when
    fewer than 0x40 bytes are left.  */
 cmp $-0x40, %rdx
 lea 0x80(%rdx), %rdx
 jl L(large_page_less_bwd_64bytes)

 /* Copy one more 0x40-byte block.  */
 movdqu -0x10(%rsi), %xmm0
 movdqu -0x20(%rsi), %xmm1
 movdqu -0x30(%rsi), %xmm2
 movdqu -0x40(%rsi), %xmm3
 lea -0x40(%rsi), %rsi

 movntdq %xmm0, -0x10(%rdi)
 movntdq %xmm1, -0x20(%rdi)
 movntdq %xmm2, -0x30(%rdi)
 movntdq %xmm3, -0x40(%rdi)
 lea -0x40(%rdi), %rdi
 sub $0x40, %rdx
L(large_page_less_bwd_64bytes):
 /* %rsi/%rdi already point just past the remaining %rdx bytes, so no
    pointer adjustment is made before the tail dispatch.  Fence the
    non-temporal stores first.  */
 sfence
 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
2984
#ifdef USE_AS_MEMMOVE
 /* memmove-only backward loop reached from L(large_page_bwd).  It has
    the same structure as L(large_page_bwd_loop) but uses ordinary
    aligned stores (movaps) instead of non-temporal ones, prefetches
    the source below the current pointer, and needs no sfence.  %rdx
    carries the same 0x80-biased remaining count.  movaps requires a
    16-byte aligned memory operand, so %rdi must be aligned.  */
 .p2align 4
L(ll_cache_copy_bwd_start):
 /* Prefetch 0x1c0 and 0x200 bytes below the source pointer.  */
 prefetcht0 -0x1c0(%rsi)
 prefetcht0 -0x200(%rsi)
 movdqu -0x10(%rsi), %xmm0
 movdqu -0x20(%rsi), %xmm1
 movdqu -0x30(%rsi), %xmm2
 movdqu -0x40(%rsi), %xmm3
 movdqu -0x50(%rsi), %xmm4
 movdqu -0x60(%rsi), %xmm5
 movdqu -0x70(%rsi), %xmm6
 movdqu -0x80(%rsi), %xmm7
 lea -0x80(%rsi), %rsi

 /* Flags from this sub are consumed by the jae below; movaps and lea
    leave them unchanged.  */
 sub $0x80, %rdx
 movaps %xmm0, -0x10(%rdi)
 movaps %xmm1, -0x20(%rdi)
 movaps %xmm2, -0x30(%rdi)
 movaps %xmm3, -0x40(%rdi)
 movaps %xmm4, -0x50(%rdi)
 movaps %xmm5, -0x60(%rdi)
 movaps %xmm6, -0x70(%rdi)
 movaps %xmm7, -0x80(%rdi)
 lea -0x80(%rdi), %rdi
 jae L(ll_cache_copy_bwd_start)
 /* %rdx is (bytes left - 0x80): test against -0x40, then undo the
    bias with lea, which preserves the flags for the jl.  */
 cmp $-0x40, %rdx
 lea 0x80(%rdx), %rdx
 jl L(large_page_ll_less_bwd_64bytes)

 /* Copy one more 0x40-byte block.  */
 movdqu -0x10(%rsi), %xmm0
 movdqu -0x20(%rsi), %xmm1
 movdqu -0x30(%rsi), %xmm2
 movdqu -0x40(%rsi), %xmm3
 lea -0x40(%rsi), %rsi

 movaps %xmm0, -0x10(%rdi)
 movaps %xmm1, -0x20(%rdi)
 movaps %xmm2, -0x30(%rdi)
 movaps %xmm3, -0x40(%rdi)
 lea -0x40(%rdi), %rdi
 sub $0x40, %rdx
L(large_page_ll_less_bwd_64bytes):
 /* %rsi/%rdi already point just past the remaining %rdx bytes; let
    the jump table copy the tail.  */
 BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
#endif
3030
3031END (MEMCPY)
3032
3033 .section .rodata.ssse3,"a",@progbits
3034 .p2align 3
/* Jump table for the final 0..79 bytes of a copy.  Entry N is the
   32-bit offset of L(write_Nbytes) relative to the start of this table
   (JMPTBL (I, B) expands to I - B).  BRANCH_TO_JMPTBL_ENTRY loads the
   entry selected by the byte count with a scale of 4, sign-extends it,
   adds the table address and jumps there.  The entries must stay in
   ascending order of N with no gaps.  */
L(table_less_80bytes):
 .int JMPTBL (L(write_0bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_1bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_2bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_3bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_4bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_5bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_6bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_7bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_8bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_9bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_10bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_11bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_12bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_13bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_14bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_15bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_16bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_17bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_18bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_19bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_20bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_21bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_22bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_23bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_24bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_25bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_26bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_27bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_28bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_29bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_30bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_31bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_32bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_33bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_34bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_35bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_36bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_37bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_38bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_39bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_40bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_41bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_42bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_43bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_44bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_45bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_46bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_47bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_48bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_49bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_50bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_51bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_52bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_53bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_54bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_55bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_56bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_57bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_58bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_59bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_60bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_61bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_62bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_63bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_64bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_65bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_66bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_67bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_68bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_69bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_70bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_71bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_72bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_73bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_74bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_75bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_76bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_77bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_78bytes), L(table_less_80bytes))
 .int JMPTBL (L(write_79bytes), L(table_less_80bytes))
3116
3117 .p2align 3
/* Jump table with sixteen entries, L(shl_0) through L(shl_15), each
   stored as a 32-bit offset relative to the start of this table
   (JMPTBL (I, B) expands to I - B).  The code that indexes it is
   outside this part of the file; presumably the index is the
   source/destination misalignment modulo 16 for the forward copy -
   confirm against the dispatch site.  */
L(shl_table):
 .int JMPTBL (L(shl_0), L(shl_table))
 .int JMPTBL (L(shl_1), L(shl_table))
 .int JMPTBL (L(shl_2), L(shl_table))
 .int JMPTBL (L(shl_3), L(shl_table))
 .int JMPTBL (L(shl_4), L(shl_table))
 .int JMPTBL (L(shl_5), L(shl_table))
 .int JMPTBL (L(shl_6), L(shl_table))
 .int JMPTBL (L(shl_7), L(shl_table))
 .int JMPTBL (L(shl_8), L(shl_table))
 .int JMPTBL (L(shl_9), L(shl_table))
 .int JMPTBL (L(shl_10), L(shl_table))
 .int JMPTBL (L(shl_11), L(shl_table))
 .int JMPTBL (L(shl_12), L(shl_table))
 .int JMPTBL (L(shl_13), L(shl_table))
 .int JMPTBL (L(shl_14), L(shl_table))
 .int JMPTBL (L(shl_15), L(shl_table))
3135
3136 .p2align 3
/* Jump table with sixteen entries, L(shl_0_bwd) through
   L(shl_15_bwd), each stored as a 32-bit offset relative to the start
   of this table (JMPTBL (I, B) expands to I - B).  The code that
   indexes it is outside this part of the file; presumably it is the
   backward-copy counterpart of L(shl_table) - confirm against the
   dispatch site.  */
L(shl_table_bwd):
 .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
 .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
 .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
 .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
 .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
 .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
 .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
 .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
 .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
 .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
 .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
 .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
 .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
 .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
 .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
 .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
3154
3155#endif
3156