/* Copyright (C) 2002-2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <assert.h>
#include <errno.h>
#include <signal.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <dl-sysdep.h>
#include <dl-tls.h>
#include <tls.h>
#include <list.h>
#include <lowlevellock.h>
#include <futex-internal.h>
#include <kernel-features.h>
#include <stack-aliasing.h>


#ifndef NEED_SEPARATE_REGISTER_STACK

/* Most architectures have exactly one stack pointer.  Some have more.  */
# define STACK_VARIABLES void *stackaddr = NULL

/* How to pass the values to the 'create_thread' function.  */
# define STACK_VARIABLES_ARGS stackaddr

/* How to declare the function which receives these parameters.  */
# define STACK_VARIABLES_PARMS void *stackaddr

/* How to declare allocate_stack.  */
# define ALLOCATE_STACK_PARMS void **stack

/* This is how the function is called.  We do it this way to allow
   other variants of the function to have more parameters.  */
# define ALLOCATE_STACK(attr, pd) allocate_stack (attr, pd, &stackaddr)

#else

/* We need two stacks.  The kernel will place them but we have to tell
   the kernel about the size of the reserved address space.  */
# define STACK_VARIABLES void *stackaddr = NULL; size_t stacksize = 0

/* How to pass the values to the 'create_thread' function.  */
# define STACK_VARIABLES_ARGS stackaddr, stacksize

/* How to declare the function which receives these parameters.  */
# define STACK_VARIABLES_PARMS void *stackaddr, size_t stacksize

/* How to declare allocate_stack.  */
# define ALLOCATE_STACK_PARMS void **stack, size_t *stacksize

/* This is how the function is called.  We do it this way to allow
   other variants of the function to have more parameters.  */
# define ALLOCATE_STACK(attr, pd) \
  allocate_stack (attr, pd, &stackaddr, &stacksize)

#endif
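
/* Taken together, these macros keep the callers in pthread_create
   identical for both kinds of architectures.  As a rough, illustrative
   sketch only (the real create_thread prototype has more parameters),
   the intended composition is:

     STACK_VARIABLES;
     struct pthread *pd;
     int err = ALLOCATE_STACK (attr, &pd);
     if (err == 0)
       err = create_thread (pd, attr, STACK_VARIABLES_ARGS, ...);

   where create_thread is declared with STACK_VARIABLES_PARMS among its
   parameters.  */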


/* Default alignment of stack.  */
#ifndef STACK_ALIGN
# define STACK_ALIGN __alignof__ (long double)
#endif

/* Default value for minimal stack size after allocating thread
   descriptor and guard.  */
#ifndef MINIMAL_REST_STACK
# define MINIMAL_REST_STACK 4096
#endif


/* Newer kernels have the MAP_STACK flag to indicate a mapping is used for
   a stack.  Use it when possible.  */
#ifndef MAP_STACK
# define MAP_STACK 0
#endif

/* This yields the pointer that TLS support code calls the thread pointer.  */
#if TLS_TCB_AT_TP
# define TLS_TPADJ(pd) (pd)
#elif TLS_DTV_AT_TP
# define TLS_TPADJ(pd) ((struct pthread *)((char *) (pd) + TLS_PRE_TCB_SIZE))
#endif
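
/* The adjustment above reflects the two TLS layouts: with TLS_TCB_AT_TP
   the thread pointer points directly at the thread descriptor, so no
   adjustment is needed; with TLS_DTV_AT_TP the TCB/DTV header sits at
   the thread pointer and struct pthread is placed TLS_PRE_TCB_SIZE
   bytes before it, so the descriptor address must be moved up by that
   amount to obtain the value the TLS support code works with.  */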

/* Cache handling for not-yet-freed stacks.  */

/* Maximum size of the cache, in bytes.  */
static size_t stack_cache_maxsize = 40 * 1024 * 1024; /* 40 MiB by default.  */
static size_t stack_cache_actsize;

/* Lock protecting the stack cache and the lists of stacks.  */
static int stack_cache_lock = LLL_LOCK_INITIALIZER;

/* List of cached (not yet freed) stacks.  */
static LIST_HEAD (stack_cache);

/* List of the stacks in use.  */
static LIST_HEAD (stack_used);

/* We need to record what list operations we are going to do so that,
   in case of an asynchronous interruption due to a fork() call, we
   can undo or complete the pending work.  */
static uintptr_t in_flight_stack;
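
/* The value is a tagged pointer: bit 0 set means an add operation is in
   flight, bit 0 clear means a delete; the remaining bits hold the list
   element involved (see stack_list_add and stack_list_del below).
   __reclaim_stacks inspects it after fork to replay or undo the
   interrupted operation.  */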

/* List of the threads with user provided stacks in use.  No need to
   initialize this, since it's done in __pthread_initialize_minimal.  */
list_t __stack_user __attribute__ ((nocommon));
hidden_data_def (__stack_user)

#if COLORING_INCREMENT != 0
/* Number of threads created.  */
static unsigned int nptl_ncreated;
#endif


/* Check whether the stack is still used or not.  */
#define FREE_P(descr) ((descr)->tid <= 0)


static void
stack_list_del (list_t *elem)
{
  in_flight_stack = (uintptr_t) elem;

  atomic_write_barrier ();

  list_del (elem);

  atomic_write_barrier ();

  in_flight_stack = 0;
}


static void
stack_list_add (list_t *elem, list_t *list)
{
  in_flight_stack = (uintptr_t) elem | 1;

  atomic_write_barrier ();

  list_add (elem, list);

  atomic_write_barrier ();

  in_flight_stack = 0;
}
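
/* In both helpers the write barriers order the stores so that a fork
   happening at any point observes either a recorded in-flight operation
   or a fully consistent list, never a half-linked element without a
   record of it; __reclaim_stacks relies on this to repair the lists in
   the child.  */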


/* We create a doubly linked list of all cache entries.  Doubly linked
   because this allows removing entries from the end.  */


/* Get a stack from the cache.  We have to match by size since some
   blocks might be too small or far too large.  */
static struct pthread *
get_cached_stack (size_t *sizep, void **memp)
{
  size_t size = *sizep;
  struct pthread *result = NULL;
  list_t *entry;

  lll_lock (stack_cache_lock, LLL_PRIVATE);

  /* Search the cache for a matching entry.  We search for the
     smallest stack which has at least the required size.  Note that
     in normal situations the size of all allocated stacks is the
     same.  At the very least there are only a few different sizes.
     Therefore this loop will exit early most of the time with an
     exact match.  */
  list_for_each (entry, &stack_cache)
    {
      struct pthread *curr;

      curr = list_entry (entry, struct pthread, list);
      if (FREE_P (curr) && curr->stackblock_size >= size)
        {
          if (curr->stackblock_size == size)
            {
              result = curr;
              break;
            }

          if (result == NULL
              || result->stackblock_size > curr->stackblock_size)
            result = curr;
        }
    }

  if (__builtin_expect (result == NULL, 0)
      /* Make sure the size difference is not excessive.  In that
         case we do not use the block.  */
      || __builtin_expect (result->stackblock_size > 4 * size, 0))
    {
      /* Release the lock.  */
      lll_unlock (stack_cache_lock, LLL_PRIVATE);

      return NULL;
    }

  /* Don't allow setxid until cloned.  */
  result->setxid_futex = -1;

  /* Dequeue the entry.  */
  stack_list_del (&result->list);

  /* And add to the list of stacks in use.  */
  stack_list_add (&result->list, &stack_used);

  /* And decrease the cache size.  */
  stack_cache_actsize -= result->stackblock_size;

  /* Release the lock early.  */
  lll_unlock (stack_cache_lock, LLL_PRIVATE);

  /* Report size and location of the stack to the caller.  */
  *sizep = result->stackblock_size;
  *memp = result->stackblock;

  /* Cancellation handling is back to the default.  */
  result->cancelhandling = 0;
  result->cleanup = NULL;

  /* No pending event.  */
  result->nextevent = NULL;

  /* Clear the DTV.  */
  dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
  for (size_t cnt = 0; cnt < dtv[-1].counter; ++cnt)
    if (! dtv[1 + cnt].pointer.is_static
        && dtv[1 + cnt].pointer.val != TLS_DTV_UNALLOCATED)
      free (dtv[1 + cnt].pointer.val);
  memset (dtv, '\0', (dtv[-1].counter + 1) * sizeof (dtv_t));

  /* Re-initialize the TLS.  */
  _dl_allocate_tls_init (TLS_TPADJ (result));

  return result;
}


/* Free stacks until cache size is lower than LIMIT.  */
void
__free_stacks (size_t limit)
{
  /* We reduce the size of the cache.  Remove the last entries until
     the size is below the limit.  */
  list_t *entry;
  list_t *prev;

  /* Search from the end of the list.  */
  list_for_each_prev_safe (entry, prev, &stack_cache)
    {
      struct pthread *curr;

      curr = list_entry (entry, struct pthread, list);
      if (FREE_P (curr))
        {
          /* Unlink the block.  */
          stack_list_del (entry);

          /* Account for the freed memory.  */
          stack_cache_actsize -= curr->stackblock_size;

          /* Free the memory associated with the ELF TLS.  */
          _dl_deallocate_tls (TLS_TPADJ (curr), false);

          /* Remove this block.  This should never fail.  If it does
             something is really wrong.  */
          if (munmap (curr->stackblock, curr->stackblock_size) != 0)
            abort ();

          /* Maybe we have freed enough.  */
          if (stack_cache_actsize <= limit)
            break;
        }
    }
}


/* Add a stack which is not used anymore to the cache.  Must be
   called with the cache lock held.  */
static inline void
__attribute ((always_inline))
queue_stack (struct pthread *stack)
{
  /* We unconditionally add the stack to the list.  The memory may
     still be in use but it will not be reused until the kernel marks
     the stack as not used anymore.  */
  stack_list_add (&stack->list, &stack_cache);

  stack_cache_actsize += stack->stackblock_size;
  if (__glibc_unlikely (stack_cache_actsize > stack_cache_maxsize))
    __free_stacks (stack_cache_maxsize);
}


static int
internal_function
change_stack_perm (struct pthread *pd
#ifdef NEED_SEPARATE_REGISTER_STACK
                   , size_t pagemask
#endif
                   )
{
#ifdef NEED_SEPARATE_REGISTER_STACK
  void *stack = (pd->stackblock
                 + (((((pd->stackblock_size - pd->guardsize) / 2)
                      & pagemask) + pd->guardsize) & pagemask));
  size_t len = pd->stackblock + pd->stackblock_size - stack;
#elif _STACK_GROWS_DOWN
  void *stack = pd->stackblock + pd->guardsize;
  size_t len = pd->stackblock_size - pd->guardsize;
#elif _STACK_GROWS_UP
  void *stack = pd->stackblock;
  size_t len = (uintptr_t) pd - pd->guardsize - (uintptr_t) pd->stackblock;
#else
# error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
#endif
  if (mprotect (stack, len, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
    return errno;

  return 0;
}
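
/* With NEED_SEPARATE_REGISTER_STACK (in practice, ia64) the mapping is
   shared between the ordinary stack, which grows down from the top of
   the block, and the register backing store, which grows up from the
   bottom half; that is why the start address above is computed from
   roughly the half-way point of the block rather than from its base.  */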


/* Returns a usable stack for a new thread either by allocating a
   new stack or reusing a cached stack of sufficient size.
   ATTR must be non-NULL and point to a valid pthread_attr.
   PDP must be non-NULL.  */
static int
allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
                ALLOCATE_STACK_PARMS)
{
  struct pthread *pd;
  size_t size;
  size_t pagesize_m1 = __getpagesize () - 1;

  assert (powerof2 (pagesize_m1 + 1));
  assert (TCB_ALIGNMENT >= STACK_ALIGN);

  /* Get the stack size from the attribute if it is set.  Otherwise we
     use the default we determined at start time.  */
  if (attr->stacksize != 0)
    size = attr->stacksize;
  else
    {
      lll_lock (__default_pthread_attr_lock, LLL_PRIVATE);
      size = __default_pthread_attr.stacksize;
      lll_unlock (__default_pthread_attr_lock, LLL_PRIVATE);
    }

  /* Get memory for the stack.  */
  if (__glibc_unlikely (attr->flags & ATTR_FLAG_STACKADDR))
    {
      uintptr_t adj;
      char *stackaddr = (char *) attr->stackaddr;

      /* Assume the same layout as the _STACK_GROWS_DOWN case, with struct
         pthread at the top of the stack block.  Later we adjust the guard
         location and stack address to match the _STACK_GROWS_UP case.  */
      if (_STACK_GROWS_UP)
        stackaddr += attr->stacksize;

      /* If the user also specified the size of the stack make sure it
         is large enough.  */
      if (attr->stacksize != 0
          && attr->stacksize < (__static_tls_size + MINIMAL_REST_STACK))
        return EINVAL;

      /* Adjust stack size for alignment of the TLS block.  */
#if TLS_TCB_AT_TP
      adj = ((uintptr_t) stackaddr - TLS_TCB_SIZE)
            & __static_tls_align_m1;
      assert (size > adj + TLS_TCB_SIZE);
#elif TLS_DTV_AT_TP
      adj = ((uintptr_t) stackaddr - __static_tls_size)
            & __static_tls_align_m1;
      assert (size > adj);
#endif

      /* The user provided some memory.  Let's hope it matches the
         size...  We do not allocate guard pages if the user provided
         the stack.  It is the user's responsibility to do this if it
         is wanted.  */
#if TLS_TCB_AT_TP
      pd = (struct pthread *) ((uintptr_t) stackaddr
                               - TLS_TCB_SIZE - adj);
#elif TLS_DTV_AT_TP
      pd = (struct pthread *) (((uintptr_t) stackaddr
                                - __static_tls_size - adj)
                               - TLS_PRE_TCB_SIZE);
#endif

      /* Only the thread descriptor embedded in the user-provided stack
         needs to be cleared; the rest of the memory is left alone.  */
      memset (pd, '\0', sizeof (struct pthread));

      /* The first TSD block is included in the TCB.  */
      pd->specific[0] = pd->specific_1stblock;

      /* Remember the stack-related values.  */
      pd->stackblock = (char *) stackaddr - size;
      pd->stackblock_size = size;

      /* This is a user-provided stack.  It will not be queued in the
         stack cache nor will the memory (except the TLS memory) be freed.  */
      pd->user_stack = true;

      /* This is at least the second thread.  */
      pd->header.multiple_threads = 1;
#ifndef TLS_MULTIPLE_THREADS_IN_TCB
      __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
#endif

#ifndef __ASSUME_PRIVATE_FUTEX
      /* The thread must know when private futexes are supported.  */
      pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
                                                header.private_futex);
#endif

#ifdef NEED_DL_SYSINFO
      SETUP_THREAD_SYSINFO (pd);
#endif

      /* Don't allow setxid until cloned.  */
      pd->setxid_futex = -1;

      /* Allocate the DTV for this thread.  */
      if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
        {
          /* Something went wrong.  */
          assert (errno == ENOMEM);
          return errno;
        }


      /* Prepare to modify global data.  */
      lll_lock (stack_cache_lock, LLL_PRIVATE);

      /* And add to the list of stacks in use.  */
      list_add (&pd->list, &__stack_user);

      lll_unlock (stack_cache_lock, LLL_PRIVATE);
    }
  else
    {
      /* Allocate some anonymous memory.  If possible use the cache.  */
      size_t guardsize;
      size_t reqsize;
      void *mem;
      const int prot = (PROT_READ | PROT_WRITE
                        | ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));

#if COLORING_INCREMENT != 0
      /* Add one more page for stack coloring.  Don't do it for stacks
         with 16 times pagesize or larger.  This might just cause
         unnecessary misalignment.  */
      if (size <= 16 * pagesize_m1)
        size += pagesize_m1 + 1;
#endif

      /* Adjust the stack size for alignment.  */
      size &= ~__static_tls_align_m1;
      assert (size != 0);

      /* Make sure the size of the stack is enough for the guard and,
         if necessary, the thread descriptor.  */
      guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
      if (guardsize < attr->guardsize || size + guardsize < guardsize)
        /* Arithmetic overflow.  */
        return EINVAL;
      size += guardsize;
      if (__builtin_expect (size < ((guardsize + __static_tls_size
                                     + MINIMAL_REST_STACK + pagesize_m1)
                                    & ~pagesize_m1),
                            0))
        /* The stack is too small (or the guard too large).  */
        return EINVAL;

      /* Try to get a stack from the cache.  */
      reqsize = size;
      pd = get_cached_stack (&size, &mem);
      if (pd == NULL)
        {
          /* To avoid aliasing effects on a larger scale than pages we
             adjust the allocated stack size if necessary.  This way
             allocations directly following each other will not have
             aliasing problems.  */
#if MULTI_PAGE_ALIASING != 0
          if ((size % MULTI_PAGE_ALIASING) == 0)
            size += pagesize_m1 + 1;
#endif

          mem = mmap (NULL, size, prot,
                      MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);

          if (__glibc_unlikely (mem == MAP_FAILED))
            return errno;

          /* SIZE is guaranteed to be greater than zero.
             So we can never get a null pointer back from mmap.  */
          assert (mem != NULL);

#if COLORING_INCREMENT != 0
          /* Atomically increment NCREATED.  */
          unsigned int ncreated = atomic_increment_val (&nptl_ncreated);

          /* We choose the offset for coloring by incrementing it for
             every new thread by a fixed amount.  The offset is used
             modulo the page size.  Even though coloring relative to
             higher alignment values might work better, it makes no
             sense to attempt it since the mmap() interface does not
             allow us to specify any alignment for the returned memory
             block.  */
          size_t coloring = (ncreated * COLORING_INCREMENT) & pagesize_m1;

          /* Make sure the coloring offset does not disturb the alignment
             of the TCB and static TLS block.  */
          if (__glibc_unlikely ((coloring & __static_tls_align_m1) != 0))
            coloring = (((coloring + __static_tls_align_m1)
                         & ~(__static_tls_align_m1))
                        & ~pagesize_m1);
#else
          /* Unless specified we do not make any adjustments.  */
# define coloring 0
#endif

          /* Place the thread descriptor at the end of the stack.  */
#if TLS_TCB_AT_TP
          pd = (struct pthread *) ((char *) mem + size - coloring) - 1;
#elif TLS_DTV_AT_TP
          pd = (struct pthread *) ((((uintptr_t) mem + size - coloring
                                     - __static_tls_size)
                                    & ~__static_tls_align_m1)
                                   - TLS_PRE_TCB_SIZE);
#endif

          /* Remember the stack-related values.  */
          pd->stackblock = mem;
          pd->stackblock_size = size;

          /* We allocated the first block of the thread-specific data
             array within the descriptor.  This address will not change
             for the lifetime of this descriptor.  */
          pd->specific[0] = pd->specific_1stblock;

          /* This is at least the second thread.  */
          pd->header.multiple_threads = 1;
#ifndef TLS_MULTIPLE_THREADS_IN_TCB
          __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
#endif

#ifndef __ASSUME_PRIVATE_FUTEX
          /* The thread must know when private futexes are supported.  */
          pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
                                                    header.private_futex);
#endif

#ifdef NEED_DL_SYSINFO
          SETUP_THREAD_SYSINFO (pd);
#endif

          /* Don't allow setxid until cloned.  */
          pd->setxid_futex = -1;

          /* Allocate the DTV for this thread.  */
          if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
            {
              /* Something went wrong.  */
              assert (errno == ENOMEM);

              /* Free the stack memory we just allocated.  */
              (void) munmap (mem, size);

              return errno;
            }


          /* Prepare to modify global data.  */
          lll_lock (stack_cache_lock, LLL_PRIVATE);

          /* And add to the list of stacks in use.  */
          stack_list_add (&pd->list, &stack_used);

          lll_unlock (stack_cache_lock, LLL_PRIVATE);


          /* There might have been a race.  Another thread might have
             caused the stacks to get exec permission while this new
             stack was prepared.  Detect if this was possible and
             change the permission if necessary.  */
          if (__builtin_expect ((GL(dl_stack_flags) & PF_X) != 0
                                && (prot & PROT_EXEC) == 0, 0))
            {
              int err = change_stack_perm (pd
#ifdef NEED_SEPARATE_REGISTER_STACK
                                           , ~pagesize_m1
#endif
                                           );
              if (err != 0)
                {
                  /* Free the stack memory we just allocated.  */
                  (void) munmap (mem, size);

                  return err;
                }
            }


          /* Note that all of the stack and the thread descriptor are
             zeroed.  This means we do not have to initialize fields
             with initial value zero.  This is specifically true for
             the 'tid' field which is always set back to zero once the
             stack is not used anymore and for the 'guardsize' field
             which will be read next.  */
        }

      /* Create or resize the guard area if necessary.  */
      if (__glibc_unlikely (guardsize > pd->guardsize))
        {
#ifdef NEED_SEPARATE_REGISTER_STACK
          char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
#elif _STACK_GROWS_DOWN
          char *guard = mem;
#elif _STACK_GROWS_UP
          char *guard = (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
#endif
          if (mprotect (guard, guardsize, PROT_NONE) != 0)
            {
            mprot_error:
              lll_lock (stack_cache_lock, LLL_PRIVATE);

              /* Remove the thread from the list.  */
              stack_list_del (&pd->list);

              lll_unlock (stack_cache_lock, LLL_PRIVATE);

              /* Get rid of the TLS block we allocated.  */
              _dl_deallocate_tls (TLS_TPADJ (pd), false);

              /* Free the stack memory regardless of whether the size
                 of the cache is over the limit or not.  If this piece
                 of memory caused problems we had better not use it
                 anymore.  We also ignore possible errors here; there
                 is nothing we could do about them.  */
              (void) munmap (mem, size);

              return errno;
            }

          pd->guardsize = guardsize;
        }
      else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
                                 0))
        {
          /* The old guard area is too large.  */

#ifdef NEED_SEPARATE_REGISTER_STACK
          char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
          char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1);

          if (oldguard < guard
              && mprotect (oldguard, guard - oldguard, prot) != 0)
            goto mprot_error;

          if (mprotect (guard + guardsize,
                        oldguard + pd->guardsize - guard - guardsize,
                        prot) != 0)
            goto mprot_error;
#elif _STACK_GROWS_DOWN
          if (mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
                        prot) != 0)
            goto mprot_error;
#elif _STACK_GROWS_UP
          if (mprotect ((char *) pd - pd->guardsize,
                        pd->guardsize - guardsize, prot) != 0)
            goto mprot_error;
#endif

          pd->guardsize = guardsize;
        }
      /* The pthread_getattr_np() calls need to be given the size
         requested in the attribute, regardless of how large the
         actually used guardsize is.  */
      pd->reported_guardsize = guardsize;
    }

  /* Initialize the lock.  We have to do this unconditionally since the
     stillborn thread could be canceled while the lock is taken.  */
  pd->lock = LLL_LOCK_INITIALIZER;

  /* The robust mutex lists also need to be initialized
     unconditionally because the cleanup for the previous stack owner
     might have happened in the kernel.  */
  pd->robust_head.futex_offset = (offsetof (pthread_mutex_t, __data.__lock)
                                  - offsetof (pthread_mutex_t,
                                              __data.__list.__next));
  pd->robust_head.list_op_pending = NULL;
#ifdef __PTHREAD_MUTEX_HAVE_PREV
  pd->robust_prev = &pd->robust_head;
#endif
  pd->robust_head.list = &pd->robust_head;
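
  /* The futex_offset value tells the kernel where, relative to each
     element on the robust list (the __list member embedded in a
     pthread_mutex_t), the futex word itself lives; on thread exit the
     kernel walks robust_head.list and uses this offset to mark each
     still-held robust mutex with FUTEX_OWNER_DIED.  The list is
     circular, which is why it initially points back at the head.  */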

  /* We place the thread descriptor at the end of the stack.  */
  *pdp = pd;

#if _STACK_GROWS_DOWN
  void *stacktop;

# if TLS_TCB_AT_TP
  /* The stack begins before the TCB and the static TLS block.  */
  stacktop = ((char *) (pd + 1) - __static_tls_size);
# elif TLS_DTV_AT_TP
  stacktop = (char *) (pd - 1);
# endif

# ifdef NEED_SEPARATE_REGISTER_STACK
  *stack = pd->stackblock;
  *stacksize = stacktop - *stack;
# else
  *stack = stacktop;
# endif
#else
  *stack = pd->stackblock;
#endif

  return 0;
}


void
internal_function
__deallocate_stack (struct pthread *pd)
{
  lll_lock (stack_cache_lock, LLL_PRIVATE);

  /* Remove the thread from whichever list of stacks (in-use or
     user-provided) it is currently on.  */
  stack_list_del (&pd->list);

  /* Not much to do.  Just free the mmap()ed memory.  Note that we do
     not reset the 'used' flag in the 'tid' field.  This is done by
     the kernel.  If no thread has been created yet this field is
     still zero.  */
  if (__glibc_likely (! pd->user_stack))
    (void) queue_stack (pd);
  else
    /* Free the memory associated with the ELF TLS.  */
    _dl_deallocate_tls (TLS_TPADJ (pd), false);

  lll_unlock (stack_cache_lock, LLL_PRIVATE);
}


int
internal_function
__make_stacks_executable (void **stack_endp)
{
  /* First the main thread's stack.  */
  int err = _dl_make_stack_executable (stack_endp);
  if (err != 0)
    return err;

#ifdef NEED_SEPARATE_REGISTER_STACK
  const size_t pagemask = ~(__getpagesize () - 1);
#endif

  lll_lock (stack_cache_lock, LLL_PRIVATE);

  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      err = change_stack_perm (list_entry (runp, struct pthread, list)
#ifdef NEED_SEPARATE_REGISTER_STACK
                               , pagemask
#endif
                               );
      if (err != 0)
        break;
    }

  /* Also change the permission for the currently unused stacks.  This
     might be wasted time but it is better spent here than on adding a
     check in the fast path.  */
  if (err == 0)
    list_for_each (runp, &stack_cache)
      {
        err = change_stack_perm (list_entry (runp, struct pthread, list)
#ifdef NEED_SEPARATE_REGISTER_STACK
                                 , pagemask
#endif
                                 );
        if (err != 0)
          break;
      }

  lll_unlock (stack_cache_lock, LLL_PRIVATE);

  return err;
}


/* In case of a fork() call the memory allocation in the child will be
   the same but only one thread is running.  All stacks except that of
   the one running thread are not used anymore.  We have to recycle
   them.  */
void
__reclaim_stacks (void)
{
  struct pthread *self = (struct pthread *) THREAD_SELF;

  /* No locking necessary.  The caller's thread is the only one
     running.  But we have to be aware that we might have interrupted
     a list operation.  */

  if (in_flight_stack != 0)
    {
      bool add_p = in_flight_stack & 1;
      list_t *elem = (list_t *) (in_flight_stack & ~(uintptr_t) 1);

      if (add_p)
        {
          /* We always add at the beginning of the list.  So in this case we
             only need to check the beginning of these lists to see if the
             pointers at the head of the list are inconsistent.  */
          list_t *l = NULL;

          if (stack_used.next->prev != &stack_used)
            l = &stack_used;
          else if (stack_cache.next->prev != &stack_cache)
            l = &stack_cache;

          if (l != NULL)
            {
              assert (l->next->prev == elem);
              elem->next = l->next;
              elem->prev = l;
              l->next = elem;
            }
        }
      else
        {
          /* We can simply always replay the delete operation.  */
          elem->next->prev = elem->prev;
          elem->prev->next = elem->next;
        }
    }

  /* Mark all stacks except the still running one as free.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      struct pthread *curp = list_entry (runp, struct pthread, list);
      if (curp != self)
        {
          /* This marks the stack as free.  */
          curp->tid = 0;

          /* Account for the size of the stack.  */
          stack_cache_actsize += curp->stackblock_size;

          if (curp->specific_used)
            {
              /* Clear the thread-specific data.  */
              memset (curp->specific_1stblock, '\0',
                      sizeof (curp->specific_1stblock));

              curp->specific_used = false;

              for (size_t cnt = 1; cnt < PTHREAD_KEY_1STLEVEL_SIZE; ++cnt)
                if (curp->specific[cnt] != NULL)
                  {
                    memset (curp->specific[cnt], '\0',
                            sizeof (curp->specific_1stblock));

                    /* We have allocated the block which we do not
                       free here so re-set the bit.  */
                    curp->specific_used = true;
                  }
            }
        }
    }

  /* Move all stacks from the in-use list to the cache.  */
  list_splice (&stack_used, &stack_cache);

  /* Remove the entry for the current thread from the cache list and
     add it to the appropriate list of threads in use.  Which of the
     two lists that is depends on the user_stack flag.  */
  stack_list_del (&self->list);

  /* Re-initialize the lists for all the threads.  */
  INIT_LIST_HEAD (&stack_used);
  INIT_LIST_HEAD (&__stack_user);

  if (__glibc_unlikely (THREAD_GETMEM (self, user_stack)))
    list_add (&self->list, &__stack_user);
  else
    list_add (&self->list, &stack_used);

  /* There is one thread running.  */
  __nptl_nthreads = 1;

  in_flight_stack = 0;

  /* Initialize locks.  */
  stack_cache_lock = LLL_LOCK_INITIALIZER;
  __default_pthread_attr_lock = LLL_LOCK_INITIALIZER;
}


#if HP_TIMING_AVAIL
# undef __find_thread_by_id
/* Find a thread given the thread ID.  */
attribute_hidden
struct pthread *
__find_thread_by_id (pid_t tid)
{
  struct pthread *result = NULL;

  lll_lock (stack_cache_lock, LLL_PRIVATE);

  /* Iterate over the list with system-allocated threads first.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      struct pthread *curp;

      curp = list_entry (runp, struct pthread, list);

      if (curp->tid == tid)
        {
          result = curp;
          goto out;
        }
    }

  /* Now the list with threads using user-allocated stacks.  */
  list_for_each (runp, &__stack_user)
    {
      struct pthread *curp;

      curp = list_entry (runp, struct pthread, list);

      if (curp->tid == tid)
        {
          result = curp;
          goto out;
        }
    }

 out:
  lll_unlock (stack_cache_lock, LLL_PRIVATE);

  return result;
}
#endif


#ifdef SIGSETXID
static void
internal_function
setxid_mark_thread (struct xid_command *cmdp, struct pthread *t)
{
  int ch;

  /* Wait until this thread is cloned.  */
  if (t->setxid_futex == -1
      && ! atomic_compare_and_exchange_bool_acq (&t->setxid_futex, -2, -1))
    do
      futex_wait_simple (&t->setxid_futex, -2, FUTEX_PRIVATE);
    while (t->setxid_futex == -2);

  /* Don't let the thread exit before the setxid handler runs.  */
  t->setxid_futex = 0;

  do
    {
      ch = t->cancelhandling;

      /* If the thread is exiting right now, ignore it.  */
      if ((ch & EXITING_BITMASK) != 0)
        {
          /* Release the futex if there is no other setxid in
             progress.  */
          if ((ch & SETXID_BITMASK) == 0)
            {
              t->setxid_futex = 1;
              futex_wake (&t->setxid_futex, 1, FUTEX_PRIVATE);
            }
          return;
        }
    }
  while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
                                               ch | SETXID_BITMASK, ch));
}


static void
internal_function
setxid_unmark_thread (struct xid_command *cmdp, struct pthread *t)
{
  int ch;

  do
    {
      ch = t->cancelhandling;
      if ((ch & SETXID_BITMASK) == 0)
        return;
    }
  while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
                                               ch & ~SETXID_BITMASK, ch));

  /* Release the futex just in case.  */
  t->setxid_futex = 1;
  futex_wake (&t->setxid_futex, 1, FUTEX_PRIVATE);
}


static int
internal_function
setxid_signal_thread (struct xid_command *cmdp, struct pthread *t)
{
  if ((t->cancelhandling & SETXID_BITMASK) == 0)
    return 0;

  int val;
  pid_t pid = __getpid ();
  INTERNAL_SYSCALL_DECL (err);
  val = INTERNAL_SYSCALL_CALL (tgkill, err, pid, t->tid, SIGSETXID);

  /* If this failed, the thread must not have started yet or it has
     already exited.  */
  if (!INTERNAL_SYSCALL_ERROR_P (val, err))
    {
      atomic_increment (&cmdp->cntr);
      return 1;
    }
  else
    return 0;
}

/* Check for consistency across set*id system call results.  The abort
   should not happen as long as all privilege changes happen through
   the glibc wrappers.  ERROR must be 0 (no error) or an errno
   code.  */
void
attribute_hidden
__nptl_setxid_error (struct xid_command *cmdp, int error)
{
  do
    {
      int olderror = cmdp->error;
      if (olderror == error)
        break;
      if (olderror != -1)
        /* Mismatch between current and previous results.  */
        abort ();
    }
  while (atomic_compare_and_exchange_bool_acq (&cmdp->error, error, -1));
}

int
attribute_hidden
__nptl_setxid (struct xid_command *cmdp)
{
  int signalled;
  int result;
  lll_lock (stack_cache_lock, LLL_PRIVATE);

  __xidcmd = cmdp;
  cmdp->cntr = 0;
  cmdp->error = -1;

  struct pthread *self = THREAD_SELF;

  /* Iterate over the list with system-allocated threads first.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self)
        continue;

      setxid_mark_thread (cmdp, t);
    }

  /* Now the list with threads using user-allocated stacks.  */
  list_for_each (runp, &__stack_user)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self)
        continue;

      setxid_mark_thread (cmdp, t);
    }

  /* Iterate until we don't succeed in signalling anyone.  That means
     we have gotten all running threads, and their children will be
     automatically correct once started.  */
  do
    {
      signalled = 0;

      list_for_each (runp, &stack_used)
        {
          struct pthread *t = list_entry (runp, struct pthread, list);
          if (t == self)
            continue;

          signalled += setxid_signal_thread (cmdp, t);
        }

      list_for_each (runp, &__stack_user)
        {
          struct pthread *t = list_entry (runp, struct pthread, list);
          if (t == self)
            continue;

          signalled += setxid_signal_thread (cmdp, t);
        }

      int cur = cmdp->cntr;
      while (cur != 0)
        {
          futex_wait_simple ((unsigned int *) &cmdp->cntr, cur,
                             FUTEX_PRIVATE);
          cur = cmdp->cntr;
        }
    }
  while (signalled != 0);

  /* Clean up flags, so that no thread blocks during exit waiting
     for a signal which will never come.  */
  list_for_each (runp, &stack_used)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self)
        continue;

      setxid_unmark_thread (cmdp, t);
    }

  list_for_each (runp, &__stack_user)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self)
        continue;

      setxid_unmark_thread (cmdp, t);
    }

  /* This must be last, otherwise the current thread might not have
     permission to send the SIGSETXID signal to the other threads.  */
  INTERNAL_SYSCALL_DECL (err);
  result = INTERNAL_SYSCALL_NCS (cmdp->syscall_no, err, 3,
                                 cmdp->id[0], cmdp->id[1], cmdp->id[2]);
  int error = 0;
  if (__glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (result, err)))
    {
      error = INTERNAL_SYSCALL_ERRNO (result, err);
      __set_errno (error);
      result = -1;
    }
  __nptl_setxid_error (cmdp, error);

  lll_unlock (stack_cache_lock, LLL_PRIVATE);
  return result;
}
#endif  /* SIGSETXID.  */


static inline void __attribute__((always_inline))
init_one_static_tls (struct pthread *curp, struct link_map *map)
{
# if TLS_TCB_AT_TP
  void *dest = (char *) curp - map->l_tls_offset;
# elif TLS_DTV_AT_TP
  void *dest = (char *) curp + map->l_tls_offset + TLS_PRE_TCB_SIZE;
# else
#  error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
# endif

  /* We cannot delay the initialization of the Static TLS area, since
     it can be accessed with LE or IE, but since the DTV is only used
     by GD and LD, we can delay its update to avoid a race.  */
  memset (__mempcpy (dest, map->l_tls_initimage, map->l_tls_initimage_size),
          '\0', map->l_tls_blocksize - map->l_tls_initimage_size);
}

void
attribute_hidden
__pthread_init_static_tls (struct link_map *map)
{
  lll_lock (stack_cache_lock, LLL_PRIVATE);

  /* Iterate over the list with system-allocated threads first.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    init_one_static_tls (list_entry (runp, struct pthread, list), map);

  /* Now the list with threads using user-allocated stacks.  */
  list_for_each (runp, &__stack_user)
    init_one_static_tls (list_entry (runp, struct pthread, list), map);

  lll_unlock (stack_cache_lock, LLL_PRIVATE);
}


void
attribute_hidden
__wait_lookup_done (void)
{
  lll_lock (stack_cache_lock, LLL_PRIVATE);

  struct pthread *self = THREAD_SELF;

  /* Iterate over the list with system-allocated threads first.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
        continue;

      int *const gscope_flagp = &t->header.gscope_flag;

      /* We have to wait until this thread is done with the global
         scope.  First tell the thread that we are waiting and
         possibly have to be woken.  */
      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
                                                THREAD_GSCOPE_FLAG_WAIT,
                                                THREAD_GSCOPE_FLAG_USED))
        continue;

      do
        futex_wait_simple ((unsigned int *) gscope_flagp,
                           THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
    }

  /* Now the list with threads using user-allocated stacks.  */
  list_for_each (runp, &__stack_user)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
        continue;

      int *const gscope_flagp = &t->header.gscope_flag;

      /* We have to wait until this thread is done with the global
         scope.  First tell the thread that we are waiting and
         possibly have to be woken.  */
      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
                                                THREAD_GSCOPE_FLAG_WAIT,
                                                THREAD_GSCOPE_FLAG_USED))
        continue;

      do
        futex_wait_simple ((unsigned int *) gscope_flagp,
                           THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
    }

  lll_unlock (stack_cache_lock, LLL_PRIVATE);
}