LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #include "kmp_utils.h"
28 #if KMP_USE_HIER_SCHED
29 #include "kmp_dispatch_hier.h"
30 #endif
31 
32 #if OMPT_SUPPORT
33 #include "ompt-specific.h"
34 #endif
35 #if OMPD_SUPPORT
36 #include "ompd-specific.h"
37 #endif
38 
39 #if OMP_PROFILING_SUPPORT
40 #include "llvm/Support/TimeProfiler.h"
41 static char *ProfileTraceFile = nullptr;
42 #endif
43 
44 /* these are temporary issues to be dealt with */
45 #define KMP_USE_PRCTL 0
46 
47 #if KMP_OS_WINDOWS
48 #include <process.h>
49 #endif
50 
51 #ifndef KMP_USE_SHM
52 // Windows and WASI do not need these include files as they don't use shared
53 // memory.
54 #else
55 #include <sys/mman.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #define SHM_SIZE 1024
59 #endif
60 
61 #if defined(KMP_GOMP_COMPAT)
62 char const __kmp_version_alt_comp[] =
63  KMP_VERSION_PREFIX "alternative compiler support: yes";
64 #endif /* defined(KMP_GOMP_COMPAT) */
65 
66 char const __kmp_version_omp_api[] =
67  KMP_VERSION_PREFIX "API version: 5.0 (201611)";
68 
69 #ifdef KMP_DEBUG
70 char const __kmp_version_lock[] =
71  KMP_VERSION_PREFIX "lock type: run time selectable";
72 #endif /* KMP_DEBUG */
73 
74 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
75 
76 /* ------------------------------------------------------------------------ */
77 
78 #if KMP_USE_MONITOR
79 kmp_info_t __kmp_monitor;
80 #endif
81 
82 /* Forward declarations */
83 
84 void __kmp_cleanup(void);
85 
86 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
87  int gtid);
88 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
89  kmp_internal_control_t *new_icvs,
90  ident_t *loc);
91 #if KMP_AFFINITY_SUPPORTED
92 static void __kmp_partition_places(kmp_team_t *team,
93  int update_master_only = 0);
94 #endif
95 static void __kmp_do_serial_initialize(void);
96 void __kmp_fork_barrier(int gtid, int tid);
97 void __kmp_join_barrier(int gtid);
98 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
99  kmp_internal_control_t *new_icvs, ident_t *loc);
100 
101 #ifdef USE_LOAD_BALANCE
102 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
103 #endif
104 
105 static int __kmp_expand_threads(int nNeed);
106 #if KMP_OS_WINDOWS
107 static int __kmp_unregister_root_other_thread(int gtid);
108 #endif
109 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
110 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
111 
112 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
113  int new_nthreads);
114 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
115 
116 /* Calculate the identifier of the current thread */
117 /* fast (and somewhat portable) way to get unique identifier of executing
118  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
119 int __kmp_get_global_thread_id() {
120  int i;
121  kmp_info_t **other_threads;
122  size_t stack_data;
123  char *stack_addr;
124  size_t stack_size;
125  char *stack_base;
126 
127  KA_TRACE(
128  1000,
129  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
130  __kmp_nth, __kmp_all_nth));
131 
132  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
133  a parallel region, made it return KMP_GTID_DNE to force serial_initialize
134  by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
135  __kmp_init_gtid for this to work. */
136 
137  if (!TCR_4(__kmp_init_gtid))
138  return KMP_GTID_DNE;
139 
140 #ifdef KMP_TDATA_GTID
141  if (TCR_4(__kmp_gtid_mode) >= 3) {
142  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
143  return __kmp_gtid;
144  }
145 #endif
146  if (TCR_4(__kmp_gtid_mode) >= 2) {
147  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
148  return __kmp_gtid_get_specific();
149  }
150  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
151 
152  stack_addr = (char *)&stack_data;
153  other_threads = __kmp_threads;
154 
155  /* ATT: The code below is a source of potential bugs due to unsynchronized
156  access to __kmp_threads array. For example:
157  1. Current thread loads other_threads[i] to thr and checks it, it is
158  non-NULL.
159  2. Current thread is suspended by OS.
160  3. Another thread unregisters and finishes (debug versions of free()
161  may fill memory with something like 0xEF).
162  4. Current thread is resumed.
163  5. Current thread reads junk from *thr.
164  TODO: Fix it. --ln */
165 
166  for (i = 0; i < __kmp_threads_capacity; i++) {
167 
168  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
169  if (!thr)
170  continue;
171 
172  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
173  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
174 
175  /* stack grows down -- search through all of the active threads */
176 
177  if (stack_addr <= stack_base) {
178  size_t stack_diff = stack_base - stack_addr;
179 
180  if (stack_diff <= stack_size) {
181  /* The only way we can be closer than the allocated */
182  /* stack size is if we are running on this thread. */
183  // __kmp_gtid_get_specific can return negative value because this
184  // function can be called by thread destructor. However, before the
185  // thread destructor is called, the value of the corresponding
186  // thread-specific data will be reset to NULL.
187  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
188  __kmp_gtid_get_specific() == i);
189  return i;
190  }
191  }
192  }
193 
194  /* get specific to try and determine our gtid */
195  KA_TRACE(1000,
196  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
197  "thread, using TLS\n"));
198  i = __kmp_gtid_get_specific();
199 
200  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
201 
202  /* if we havn't been assigned a gtid, then return code */
203  if (i < 0)
204  return i;
205 
206  // other_threads[i] can be nullptr at this point because the corresponding
207  // thread could have already been destructed. It can happen when this function
208  // is called in end library routine.
209  if (!TCR_SYNC_PTR(other_threads[i]))
210  return i;
211 
212  /* dynamically updated stack window for uber threads to avoid get_specific
213  call */
214  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
215  KMP_FATAL(StackOverflow, i);
216  }
217 
218  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
219  if (stack_addr > stack_base) {
220  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
221  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
222  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
223  stack_base);
224  } else {
225  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
226  stack_base - stack_addr);
227  }
228 
229  /* Reprint stack bounds for ubermaster since they have been refined */
230  if (__kmp_storage_map) {
231  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
232  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
233  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
234  other_threads[i]->th.th_info.ds.ds_stacksize,
235  "th_%d stack (refinement)", i);
236  }
237  return i;
238 }
239 
240 int __kmp_get_global_thread_id_reg() {
241  int gtid;
242 
243  if (!__kmp_init_serial) {
244  gtid = KMP_GTID_DNE;
245  } else
246 #ifdef KMP_TDATA_GTID
247  if (TCR_4(__kmp_gtid_mode) >= 3) {
248  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
249  gtid = __kmp_gtid;
250  } else
251 #endif
252  if (TCR_4(__kmp_gtid_mode) >= 2) {
253  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
254  gtid = __kmp_gtid_get_specific();
255  } else {
256  KA_TRACE(1000,
257  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
258  gtid = __kmp_get_global_thread_id();
259  }
260 
261  /* we must be a new uber master sibling thread */
262  if (gtid == KMP_GTID_DNE) {
263  KA_TRACE(10,
264  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
265  "Registering a new gtid.\n"));
266  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
267  if (!__kmp_init_serial) {
268  __kmp_do_serial_initialize();
269  gtid = __kmp_gtid_get_specific();
270  } else {
271  gtid = __kmp_register_root(FALSE);
272  }
273  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
274  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
275  }
276 
277  KMP_DEBUG_ASSERT(gtid >= 0);
278 
279  return gtid;
280 }
281 
282 /* caller must hold forkjoin_lock */
283 void __kmp_check_stack_overlap(kmp_info_t *th) {
284  int f;
285  char *stack_beg = NULL;
286  char *stack_end = NULL;
287  int gtid;
288 
289  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
290  if (__kmp_storage_map) {
291  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
292  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
293 
294  gtid = __kmp_gtid_from_thread(th);
295 
296  if (gtid == KMP_GTID_MONITOR) {
297  __kmp_print_storage_map_gtid(
298  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
299  "th_%s stack (%s)", "mon",
300  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
301  } else {
302  __kmp_print_storage_map_gtid(
303  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
304  "th_%d stack (%s)", gtid,
305  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
306  }
307  }
308 
309  /* No point in checking ubermaster threads since they use refinement and
310  * cannot overlap */
311  gtid = __kmp_gtid_from_thread(th);
312  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
313  KA_TRACE(10,
314  ("__kmp_check_stack_overlap: performing extensive checking\n"));
315  if (stack_beg == NULL) {
316  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
317  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
318  }
319 
320  for (f = 0; f < __kmp_threads_capacity; f++) {
321  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
322 
323  if (f_th && f_th != th) {
324  char *other_stack_end =
325  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
326  char *other_stack_beg =
327  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
328  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
329  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
330 
331  /* Print the other stack values before the abort */
332  if (__kmp_storage_map)
333  __kmp_print_storage_map_gtid(
334  -1, other_stack_beg, other_stack_end,
335  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
336  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
337 
338  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
339  __kmp_msg_null);
340  }
341  }
342  }
343  }
344  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
345 }
346 
347 /* ------------------------------------------------------------------------ */
348 
349 void __kmp_infinite_loop(void) {
350  static int done = FALSE;
351 
352  while (!done) {
353  KMP_YIELD(TRUE);
354  }
355 }
356 
357 #define MAX_MESSAGE 512
358 
359 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
360  char const *format, ...) {
361  char buffer[MAX_MESSAGE];
362  va_list ap;
363 
364  va_start(ap, format);
365  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
366  p2, (unsigned long)size, format);
367  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
368  __kmp_vprintf(kmp_err, buffer, ap);
369 #if KMP_PRINT_DATA_PLACEMENT
370  int node;
371  if (gtid >= 0) {
372  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
373  if (__kmp_storage_map_verbose) {
374  node = __kmp_get_host_node(p1);
375  if (node < 0) /* doesn't work, so don't try this next time */
376  __kmp_storage_map_verbose = FALSE;
377  else {
378  char *last;
379  int lastNode;
380  int localProc = __kmp_get_cpu_from_gtid(gtid);
381 
382  const int page_size = KMP_GET_PAGE_SIZE();
383 
384  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
385  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
386  if (localProc >= 0)
387  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
388  localProc >> 1);
389  else
390  __kmp_printf_no_lock(" GTID %d\n", gtid);
391 #if KMP_USE_PRCTL
392  /* The more elaborate format is disabled for now because of the prctl
393  * hanging bug. */
394  do {
395  last = p1;
396  lastNode = node;
397  /* This loop collates adjacent pages with the same host node. */
398  do {
399  (char *)p1 += page_size;
400  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
401  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
402  lastNode);
403  } while (p1 <= p2);
404 #else
405  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
406  (char *)p1 + (page_size - 1),
407  __kmp_get_host_node(p1));
408  if (p1 < p2) {
409  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
410  (char *)p2 + (page_size - 1),
411  __kmp_get_host_node(p2));
412  }
413 #endif
414  }
415  }
416  } else
417  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
418  }
419 #endif /* KMP_PRINT_DATA_PLACEMENT */
420  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
421 
422  va_end(ap);
423 }
424 
425 void __kmp_warn(char const *format, ...) {
426  char buffer[MAX_MESSAGE];
427  va_list ap;
428 
429  if (__kmp_generate_warnings == kmp_warnings_off) {
430  return;
431  }
432 
433  va_start(ap, format);
434 
435  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
436  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
437  __kmp_vprintf(kmp_err, buffer, ap);
438  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
439 
440  va_end(ap);
441 }
442 
443 void __kmp_abort_process() {
444  // Later threads may stall here, but that's ok because abort() will kill them.
445  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
446 
447  if (__kmp_debug_buf) {
448  __kmp_dump_debug_buffer();
449  }
450 
451 #if KMP_OS_WINDOWS
452  // Let other threads know of abnormal termination and prevent deadlock
453  // if abort happened during library initialization or shutdown
454  __kmp_global.g.g_abort = SIGABRT;
455 
456  /* On Windows* OS by default abort() causes pop-up error box, which stalls
457  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
458  boxes. _set_abort_behavior() works well, but this function is not
459  available in VS7 (this is not problem for DLL, but it is a problem for
460  static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
461  help, at least in some versions of MS C RTL.
462 
463  It seems following sequence is the only way to simulate abort() and
464  avoid pop-up error box. */
465  raise(SIGABRT);
466  _exit(3); // Just in case, if signal ignored, exit anyway.
467 #else
468  __kmp_unregister_library();
469  abort();
470 #endif
471 
472  __kmp_infinite_loop();
473  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
474 
475 } // __kmp_abort_process
476 
477 void __kmp_abort_thread(void) {
478  // TODO: Eliminate g_abort global variable and this function.
479  // In case of abort just call abort(), it will kill all the threads.
480  __kmp_infinite_loop();
481 } // __kmp_abort_thread
482 
483 /* Print out the storage map for the major kmp_info_t thread data structures
484  that are allocated together. */
485 
486 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
487  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
488  gtid);
489 
490  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
491  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
492 
493  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
494  sizeof(kmp_local_t), "th_%d.th_local", gtid);
495 
496  __kmp_print_storage_map_gtid(
497  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
498  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
499 
500  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
501  &thr->th.th_bar[bs_plain_barrier + 1],
502  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
503  gtid);
504 
505  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
506  &thr->th.th_bar[bs_forkjoin_barrier + 1],
507  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
508  gtid);
509 
510 #if KMP_FAST_REDUCTION_BARRIER
511  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
512  &thr->th.th_bar[bs_reduction_barrier + 1],
513  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
514  gtid);
515 #endif // KMP_FAST_REDUCTION_BARRIER
516 }
517 
518 /* Print out the storage map for the major kmp_team_t team data structures
519  that are allocated together. */
520 
521 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
522  int team_id, int num_thr) {
523  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
524  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
525  header, team_id);
526 
527  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
528  &team->t.t_bar[bs_last_barrier],
529  sizeof(kmp_balign_team_t) * bs_last_barrier,
530  "%s_%d.t_bar", header, team_id);
531 
532  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
533  &team->t.t_bar[bs_plain_barrier + 1],
534  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
535  header, team_id);
536 
537  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
538  &team->t.t_bar[bs_forkjoin_barrier + 1],
539  sizeof(kmp_balign_team_t),
540  "%s_%d.t_bar[forkjoin]", header, team_id);
541 
542 #if KMP_FAST_REDUCTION_BARRIER
543  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
544  &team->t.t_bar[bs_reduction_barrier + 1],
545  sizeof(kmp_balign_team_t),
546  "%s_%d.t_bar[reduction]", header, team_id);
547 #endif // KMP_FAST_REDUCTION_BARRIER
548 
549  __kmp_print_storage_map_gtid(
550  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
551  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
552 
553  __kmp_print_storage_map_gtid(
554  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
555  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
556 
557  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
558  &team->t.t_disp_buffer[num_disp_buff],
559  sizeof(dispatch_shared_info_t) * num_disp_buff,
560  "%s_%d.t_disp_buffer", header, team_id);
561 }
562 
563 static void __kmp_init_allocator() {
564  __kmp_init_memkind();
565  __kmp_init_target_mem();
566 }
567 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
568 
569 /* ------------------------------------------------------------------------ */
570 
571 #if ENABLE_LIBOMPTARGET
572 static void __kmp_init_omptarget() {
573  __kmp_init_target_task();
574 }
575 #endif
576 
577 /* ------------------------------------------------------------------------ */
578 
579 #if KMP_DYNAMIC_LIB
580 #if KMP_OS_WINDOWS
581 
582 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
583  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
584 
585  switch (fdwReason) {
586 
587  case DLL_PROCESS_ATTACH:
588  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
589 
590  return TRUE;
591 
592  case DLL_PROCESS_DETACH:
593  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
594 
595  // According to Windows* documentation for DllMain entry point:
596  // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
597  // lpReserved == NULL when FreeLibrary() is called,
598  // lpReserved != NULL when the process is terminated.
599  // When FreeLibrary() is called, worker threads remain alive. So the
600  // runtime's state is consistent and executing proper shutdown is OK.
601  // When the process is terminated, worker threads have exited or been
602  // forcefully terminated by the OS and only the shutdown thread remains.
603  // This can leave the runtime in an inconsistent state.
604  // Hence, only attempt proper cleanup when FreeLibrary() is called.
605  // Otherwise, rely on OS to reclaim resources.
606  if (lpReserved == NULL)
607  __kmp_internal_end_library(__kmp_gtid_get_specific());
608 
609  return TRUE;
610 
611  case DLL_THREAD_ATTACH:
612  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
613 
614  /* if we want to register new siblings all the time here call
615  * __kmp_get_gtid(); */
616  return TRUE;
617 
618  case DLL_THREAD_DETACH:
619  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
620 
621  __kmp_internal_end_thread(__kmp_gtid_get_specific());
622  return TRUE;
623  }
624 
625  return TRUE;
626 }
627 
628 #endif /* KMP_OS_WINDOWS */
629 #endif /* KMP_DYNAMIC_LIB */
630 
631 /* __kmp_parallel_deo -- Wait until it's our turn. */
632 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
633  int gtid = *gtid_ref;
634 #ifdef BUILD_PARALLEL_ORDERED
635  kmp_team_t *team = __kmp_team_from_gtid(gtid);
636 #endif /* BUILD_PARALLEL_ORDERED */
637 
638  if (__kmp_env_consistency_check) {
639  if (__kmp_threads[gtid]->th.th_root->r.r_active)
640 #if KMP_USE_DYNAMIC_LOCK
641  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
642 #else
643  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
644 #endif
645  }
646 #ifdef BUILD_PARALLEL_ORDERED
647  if (!team->t.t_serialized) {
648  KMP_MB();
649  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
650  NULL);
651  KMP_MB();
652  }
653 #endif /* BUILD_PARALLEL_ORDERED */
654 }
655 
656 /* __kmp_parallel_dxo -- Signal the next task. */
657 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
658  int gtid = *gtid_ref;
659 #ifdef BUILD_PARALLEL_ORDERED
660  int tid = __kmp_tid_from_gtid(gtid);
661  kmp_team_t *team = __kmp_team_from_gtid(gtid);
662 #endif /* BUILD_PARALLEL_ORDERED */
663 
664  if (__kmp_env_consistency_check) {
665  if (__kmp_threads[gtid]->th.th_root->r.r_active)
666  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
667  }
668 #ifdef BUILD_PARALLEL_ORDERED
669  if (!team->t.t_serialized) {
670  KMP_MB(); /* Flush all pending memory write invalidates. */
671 
672  /* use the tid of the next thread in this team */
673  /* TODO replace with general release procedure */
674  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
675 
676  KMP_MB(); /* Flush all pending memory write invalidates. */
677  }
678 #endif /* BUILD_PARALLEL_ORDERED */
679 }
680 
681 /* ------------------------------------------------------------------------ */
682 /* The BARRIER for a SINGLE process section is always explicit */
683 
684 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
685  int status;
686  kmp_info_t *th;
687  kmp_team_t *team;
688 
689  if (!TCR_4(__kmp_init_parallel))
690  __kmp_parallel_initialize();
691  __kmp_resume_if_soft_paused();
692 
693  th = __kmp_threads[gtid];
694  team = th->th.th_team;
695  status = 0;
696 
697  th->th.th_ident = id_ref;
698 
699  if (team->t.t_serialized) {
700  status = 1;
701  } else {
702  kmp_int32 old_this = th->th.th_local.this_construct;
703 
704  ++th->th.th_local.this_construct;
705  /* try to set team count to thread count--success means thread got the
706  single block */
707  /* TODO: Should this be acquire or release? */
708  if (team->t.t_construct == old_this) {
709  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
710  th->th.th_local.this_construct);
711  }
712 #if USE_ITT_BUILD
713  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
714  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
715  team->t.t_active_level == 1) {
716  // Only report metadata by primary thread of active team at level 1
717  __kmp_itt_metadata_single(id_ref);
718  }
719 #endif /* USE_ITT_BUILD */
720  }
721 
722  if (__kmp_env_consistency_check) {
723  if (status && push_ws) {
724  __kmp_push_workshare(gtid, ct_psingle, id_ref);
725  } else {
726  __kmp_check_workshare(gtid, ct_psingle, id_ref);
727  }
728  }
729 #if USE_ITT_BUILD
730  if (status) {
731  __kmp_itt_single_start(gtid);
732  }
733 #endif /* USE_ITT_BUILD */
734  return status;
735 }
736 
737 void __kmp_exit_single(int gtid) {
738 #if USE_ITT_BUILD
739  __kmp_itt_single_end(gtid);
740 #endif /* USE_ITT_BUILD */
741  if (__kmp_env_consistency_check)
742  __kmp_pop_workshare(gtid, ct_psingle, NULL);
743 }
744 
745 /* determine if we can go parallel or must use a serialized parallel region and
746  * how many threads we can use
747  * set_nproc is the number of threads requested for the team
748  * returns 0 if we should serialize or only use one thread,
749  * otherwise the number of threads to use
750  * The forkjoin lock is held by the caller. */
751 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
752  int master_tid, int set_nthreads,
753  int enter_teams) {
754  int capacity;
755  int new_nthreads;
756  KMP_DEBUG_ASSERT(__kmp_init_serial);
757  KMP_DEBUG_ASSERT(root && parent_team);
758  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
759 
760  // If dyn-var is set, dynamically adjust the number of desired threads,
761  // according to the method specified by dynamic_mode.
762  new_nthreads = set_nthreads;
763  if (!get__dynamic_2(parent_team, master_tid)) {
764  ;
765  }
766 #ifdef USE_LOAD_BALANCE
767  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
768  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
769  if (new_nthreads == 1) {
770  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
771  "reservation to 1 thread\n",
772  master_tid));
773  return 1;
774  }
775  if (new_nthreads < set_nthreads) {
776  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
777  "reservation to %d threads\n",
778  master_tid, new_nthreads));
779  }
780  }
781 #endif /* USE_LOAD_BALANCE */
782  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
783  new_nthreads = __kmp_avail_proc - __kmp_nth +
784  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
785  if (new_nthreads <= 1) {
786  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
787  "reservation to 1 thread\n",
788  master_tid));
789  return 1;
790  }
791  if (new_nthreads < set_nthreads) {
792  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
793  "reservation to %d threads\n",
794  master_tid, new_nthreads));
795  } else {
796  new_nthreads = set_nthreads;
797  }
798  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
799  if (set_nthreads > 2) {
800  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
801  new_nthreads = (new_nthreads % set_nthreads) + 1;
802  if (new_nthreads == 1) {
803  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
804  "reservation to 1 thread\n",
805  master_tid));
806  return 1;
807  }
808  if (new_nthreads < set_nthreads) {
809  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
810  "reservation to %d threads\n",
811  master_tid, new_nthreads));
812  }
813  }
814  } else {
815  KMP_ASSERT(0);
816  }
817 
818  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
819  if (__kmp_nth + new_nthreads -
820  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
821  __kmp_max_nth) {
822  int tl_nthreads = __kmp_max_nth - __kmp_nth +
823  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
824  if (tl_nthreads <= 0) {
825  tl_nthreads = 1;
826  }
827 
828  // If dyn-var is false, emit a 1-time warning.
829  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
830  __kmp_reserve_warn = 1;
831  __kmp_msg(kmp_ms_warning,
832  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
833  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
834  }
835  if (tl_nthreads == 1) {
836  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
837  "reduced reservation to 1 thread\n",
838  master_tid));
839  return 1;
840  }
841  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
842  "reservation to %d threads\n",
843  master_tid, tl_nthreads));
844  new_nthreads = tl_nthreads;
845  }
846 
847  // Respect OMP_THREAD_LIMIT
848  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
849  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
850  if (cg_nthreads + new_nthreads -
851  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
852  max_cg_threads) {
853  int tl_nthreads = max_cg_threads - cg_nthreads +
854  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
855  if (tl_nthreads <= 0) {
856  tl_nthreads = 1;
857  }
858 
859  // If dyn-var is false, emit a 1-time warning.
860  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
861  __kmp_reserve_warn = 1;
862  __kmp_msg(kmp_ms_warning,
863  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
864  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
865  }
866  if (tl_nthreads == 1) {
867  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
868  "reduced reservation to 1 thread\n",
869  master_tid));
870  return 1;
871  }
872  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
873  "reservation to %d threads\n",
874  master_tid, tl_nthreads));
875  new_nthreads = tl_nthreads;
876  }
877 
878  // Check if the threads array is large enough, or needs expanding.
879  // See comment in __kmp_register_root() about the adjustment if
880  // __kmp_threads[0] == NULL.
881  capacity = __kmp_threads_capacity;
882  if (TCR_PTR(__kmp_threads[0]) == NULL) {
883  --capacity;
884  }
885  // If it is not for initializing the hidden helper team, we need to take
886  // __kmp_hidden_helper_threads_num out of the capacity because it is included
887  // in __kmp_threads_capacity.
888  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
889  capacity -= __kmp_hidden_helper_threads_num;
890  }
891  if (__kmp_nth + new_nthreads -
892  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
893  capacity) {
894  // Expand the threads array.
895  int slotsRequired = __kmp_nth + new_nthreads -
896  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
897  capacity;
898  int slotsAdded = __kmp_expand_threads(slotsRequired);
899  if (slotsAdded < slotsRequired) {
900  // The threads array was not expanded enough.
901  new_nthreads -= (slotsRequired - slotsAdded);
902  KMP_ASSERT(new_nthreads >= 1);
903 
904  // If dyn-var is false, emit a 1-time warning.
905  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
906  __kmp_reserve_warn = 1;
907  if (__kmp_tp_cached) {
908  __kmp_msg(kmp_ms_warning,
909  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
910  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
911  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
912  } else {
913  __kmp_msg(kmp_ms_warning,
914  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
915  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
916  }
917  }
918  }
919  }
920 
921 #ifdef KMP_DEBUG
922  if (new_nthreads == 1) {
923  KC_TRACE(10,
924  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
925  "dead roots and rechecking; requested %d threads\n",
926  __kmp_get_gtid(), set_nthreads));
927  } else {
928  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
929  " %d threads\n",
930  __kmp_get_gtid(), new_nthreads, set_nthreads));
931  }
932 #endif // KMP_DEBUG
933  return new_nthreads;
934 }
935 
936 /* Allocate threads from the thread pool and assign them to the new team. We are
937  assured that there are enough threads available, because we checked on that
938  earlier within critical section forkjoin */
939 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
940  kmp_info_t *master_th, int master_gtid,
941  int fork_teams_workers) {
942  int i;
943  int use_hot_team;
944 
945  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
946  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
947  KMP_MB();
948 
949  /* first, let's setup the primary thread */
950  master_th->th.th_info.ds.ds_tid = 0;
951  master_th->th.th_team = team;
952  master_th->th.th_team_nproc = team->t.t_nproc;
953  master_th->th.th_team_master = master_th;
954  master_th->th.th_team_serialized = FALSE;
955  master_th->th.th_dispatch = &team->t.t_dispatch[0];
956 
957 /* make sure we are not the optimized hot team */
958 #if KMP_NESTED_HOT_TEAMS
959  use_hot_team = 0;
960  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
961  if (hot_teams) { // hot teams array is not allocated if
962  // KMP_HOT_TEAMS_MAX_LEVEL=0
963  int level = team->t.t_active_level - 1; // index in array of hot teams
964  if (master_th->th.th_teams_microtask) { // are we inside the teams?
965  if (master_th->th.th_teams_size.nteams > 1) {
966  ++level; // level was not increased in teams construct for
967  // team_of_masters
968  }
969  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
970  master_th->th.th_teams_level == team->t.t_level) {
971  ++level; // level was not increased in teams construct for
972  // team_of_workers before the parallel
973  } // team->t.t_level will be increased inside parallel
974  }
975  if (level < __kmp_hot_teams_max_level) {
976  if (hot_teams[level].hot_team) {
977  // hot team has already been allocated for given level
978  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
979  use_hot_team = 1; // the team is ready to use
980  } else {
981  use_hot_team = 0; // AC: threads are not allocated yet
982  hot_teams[level].hot_team = team; // remember new hot team
983  hot_teams[level].hot_team_nth = team->t.t_nproc;
984  }
985  } else {
986  use_hot_team = 0;
987  }
988  }
989 #else
990  use_hot_team = team == root->r.r_hot_team;
991 #endif
992  if (!use_hot_team) {
993 
994  /* install the primary thread */
995  team->t.t_threads[0] = master_th;
996  __kmp_initialize_info(master_th, team, 0, master_gtid);
997 
998  /* now, install the worker threads */
999  for (i = 1; i < team->t.t_nproc; i++) {
1000 
1001  /* fork or reallocate a new thread and install it in team */
1002  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1003  team->t.t_threads[i] = thr;
1004  KMP_DEBUG_ASSERT(thr);
1005  KMP_DEBUG_ASSERT(thr->th.th_team == team);
1006  /* align team and thread arrived states */
1007  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1008  "T#%d(%d:%d) join =%llu, plain=%llu\n",
1009  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1010  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1011  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1012  team->t.t_bar[bs_plain_barrier].b_arrived));
1013  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1014  thr->th.th_teams_level = master_th->th.th_teams_level;
1015  thr->th.th_teams_size = master_th->th.th_teams_size;
1016  { // Initialize threads' barrier data.
1017  int b;
1018  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1019  for (b = 0; b < bs_last_barrier; ++b) {
1020  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1021  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1022 #if USE_DEBUGGER
1023  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1024 #endif
1025  }
1026  }
1027  }
1028 
1029 #if KMP_AFFINITY_SUPPORTED
1030  // Do not partition the places list for teams construct workers who
1031  // haven't actually been forked to do real work yet. This partitioning
1032  // will take place in the parallel region nested within the teams construct.
1033  if (!fork_teams_workers) {
1034  __kmp_partition_places(team);
1035  }
1036 #endif
1037 
1038  if (team->t.t_nproc > 1 &&
1039  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1040  team->t.b->update_num_threads(team->t.t_nproc);
1041  __kmp_add_threads_to_team(team, team->t.t_nproc);
1042  }
1043  }
1044 
1045  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1046  for (i = 0; i < team->t.t_nproc; i++) {
1047  kmp_info_t *thr = team->t.t_threads[i];
1048  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1049  thr->th.th_prev_level != team->t.t_level) {
1050  team->t.t_display_affinity = 1;
1051  break;
1052  }
1053  }
1054  }
1055 
1056  KMP_MB();
1057 }
1058 
1059 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1060 // Propagate any changes to the floating point control registers out to the team
1061 // We try to avoid unnecessary writes to the relevant cache line in the team
1062 // structure, so we don't make changes unless they are needed.
1063 inline static void propagateFPControl(kmp_team_t *team) {
1064  if (__kmp_inherit_fp_control) {
1065  kmp_int16 x87_fpu_control_word;
1066  kmp_uint32 mxcsr;
1067 
1068  // Get primary thread's values of FPU control flags (both X87 and vector)
1069  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1070  __kmp_store_mxcsr(&mxcsr);
1071  mxcsr &= KMP_X86_MXCSR_MASK;
1072 
1073  // There is no point looking at t_fp_control_saved here.
1074  // If it is TRUE, we still have to update the values if they are different
1075  // from those we now have. If it is FALSE we didn't save anything yet, but
1076  // our objective is the same. We have to ensure that the values in the team
1077  // are the same as those we have.
1078  // So, this code achieves what we need whether or not t_fp_control_saved is
1079  // true. By checking whether the value needs updating we avoid unnecessary
1080  // writes that would put the cache-line into a written state, causing all
1081  // threads in the team to have to read it again.
1082  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1083  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1084  // Although we don't use this value, other code in the runtime wants to know
1085  // whether it should restore them. So we must ensure it is correct.
1086  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1087  } else {
1088  // Similarly here. Don't write to this cache-line in the team structure
1089  // unless we have to.
1090  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1091  }
1092 }
1093 
1094 // Do the opposite, setting the hardware registers to the updated values from
1095 // the team.
1096 inline static void updateHWFPControl(kmp_team_t *team) {
1097  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1098  // Only reset the fp control regs if they have been changed in the team.
1099  // the parallel region that we are exiting.
1100  kmp_int16 x87_fpu_control_word;
1101  kmp_uint32 mxcsr;
1102  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1103  __kmp_store_mxcsr(&mxcsr);
1104  mxcsr &= KMP_X86_MXCSR_MASK;
1105 
1106  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1107  __kmp_clear_x87_fpu_status_word();
1108  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1109  }
1110 
1111  if (team->t.t_mxcsr != mxcsr) {
1112  __kmp_load_mxcsr(&team->t.t_mxcsr);
1113  }
1114  }
1115 }
1116 #else
1117 #define propagateFPControl(x) ((void)0)
1118 #define updateHWFPControl(x) ((void)0)
1119 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1120 
1121 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1122  int realloc); // forward declaration
1123 
1124 /* Run a parallel region that has been serialized, so runs only in a team of the
1125  single primary thread. */
1126 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1127  kmp_info_t *this_thr;
1128  kmp_team_t *serial_team;
1129 
1130  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1131 
1132  /* Skip all this code for autopar serialized loops since it results in
1133  unacceptable overhead */
1134  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1135  return;
1136 
1137  if (!TCR_4(__kmp_init_parallel))
1138  __kmp_parallel_initialize();
1139  __kmp_resume_if_soft_paused();
1140 
1141  this_thr = __kmp_threads[global_tid];
1142  serial_team = this_thr->th.th_serial_team;
1143 
1144  /* utilize the serialized team held by this thread */
1145  KMP_DEBUG_ASSERT(serial_team);
1146  KMP_MB();
1147 
1148  if (__kmp_tasking_mode != tskm_immediate_exec) {
1149  KMP_DEBUG_ASSERT(
1150  this_thr->th.th_task_team ==
1151  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1152  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1153  NULL);
1154  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1155  "team %p, new task_team = NULL\n",
1156  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1157  this_thr->th.th_task_team = NULL;
1158  }
1159 
1160  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1161  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1162  proc_bind = proc_bind_false;
1163  } else if (proc_bind == proc_bind_default) {
1164  // No proc_bind clause was specified, so use the current value
1165  // of proc-bind-var for this parallel region.
1166  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1167  }
1168  // Reset for next parallel region
1169  this_thr->th.th_set_proc_bind = proc_bind_default;
1170 
1171  // Reset num_threads for next parallel region
1172  this_thr->th.th_set_nproc = 0;
1173 
1174 #if OMPT_SUPPORT
1175  ompt_data_t ompt_parallel_data = ompt_data_none;
1176  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1177  if (ompt_enabled.enabled &&
1178  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1179 
1180  ompt_task_info_t *parent_task_info;
1181  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1182 
1183  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1184  if (ompt_enabled.ompt_callback_parallel_begin) {
1185  int team_size = 1;
1186 
1187  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1188  &(parent_task_info->task_data), &(parent_task_info->frame),
1189  &ompt_parallel_data, team_size,
1190  ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1191  }
1192  }
1193 #endif // OMPT_SUPPORT
1194 
1195  if (this_thr->th.th_team != serial_team) {
1196  // Nested level will be an index in the nested nthreads array
1197  int level = this_thr->th.th_team->t.t_level;
1198 
1199  if (serial_team->t.t_serialized) {
1200  /* this serial team was already used
1201  TODO increase performance by making this locks more specific */
1202  kmp_team_t *new_team;
1203 
1204  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1205 
1206  new_team =
1207  __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1208 #if OMPT_SUPPORT
1209  ompt_parallel_data,
1210 #endif
1211  proc_bind, &this_thr->th.th_current_task->td_icvs,
1212  0 USE_NESTED_HOT_ARG(NULL));
1213  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1214  KMP_ASSERT(new_team);
1215 
1216  /* setup new serialized team and install it */
1217  new_team->t.t_threads[0] = this_thr;
1218  new_team->t.t_parent = this_thr->th.th_team;
1219  serial_team = new_team;
1220  this_thr->th.th_serial_team = serial_team;
1221 
1222  KF_TRACE(
1223  10,
1224  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1225  global_tid, serial_team));
1226 
1227  /* TODO the above breaks the requirement that if we run out of resources,
1228  then we can still guarantee that serialized teams are ok, since we may
1229  need to allocate a new one */
1230  } else {
1231  KF_TRACE(
1232  10,
1233  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1234  global_tid, serial_team));
1235  }
1236 
1237  /* we have to initialize this serial team */
1238  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1239  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1240  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1241  serial_team->t.t_ident = loc;
1242  serial_team->t.t_serialized = 1;
1243  serial_team->t.t_nproc = 1;
1244  serial_team->t.t_parent = this_thr->th.th_team;
1245  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1246  this_thr->th.th_team = serial_team;
1247  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1248 
1249  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1250  this_thr->th.th_current_task));
1251  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1252  this_thr->th.th_current_task->td_flags.executing = 0;
1253 
1254  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1255 
1256  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1257  implicit task for each serialized task represented by
1258  team->t.t_serialized? */
1259  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1260  &this_thr->th.th_current_task->td_parent->td_icvs);
1261 
1262  // Thread value exists in the nested nthreads array for the next nested
1263  // level
1264  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1265  this_thr->th.th_current_task->td_icvs.nproc =
1266  __kmp_nested_nth.nth[level + 1];
1267  }
1268 
1269  if (__kmp_nested_proc_bind.used &&
1270  (level + 1 < __kmp_nested_proc_bind.used)) {
1271  this_thr->th.th_current_task->td_icvs.proc_bind =
1272  __kmp_nested_proc_bind.bind_types[level + 1];
1273  }
1274 
1275 #if USE_DEBUGGER
1276  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1277 #endif
1278  this_thr->th.th_info.ds.ds_tid = 0;
1279 
1280  /* set thread cache values */
1281  this_thr->th.th_team_nproc = 1;
1282  this_thr->th.th_team_master = this_thr;
1283  this_thr->th.th_team_serialized = 1;
1284 
1285  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1286  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1287  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1288 
1289  propagateFPControl(serial_team);
1290 
1291  /* check if we need to allocate dispatch buffers stack */
1292  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1293  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1294  serial_team->t.t_dispatch->th_disp_buffer =
1295  (dispatch_private_info_t *)__kmp_allocate(
1296  sizeof(dispatch_private_info_t));
1297  }
1298  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1299 
1300  KMP_MB();
1301 
1302  } else {
1303  /* this serialized team is already being used,
1304  * that's fine, just add another nested level */
1305  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1306  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1307  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1308  ++serial_team->t.t_serialized;
1309  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1310 
1311  // Nested level will be an index in the nested nthreads array
1312  int level = this_thr->th.th_team->t.t_level;
1313  // Thread value exists in the nested nthreads array for the next nested
1314  // level
1315  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1316  this_thr->th.th_current_task->td_icvs.nproc =
1317  __kmp_nested_nth.nth[level + 1];
1318  }
1319  serial_team->t.t_level++;
1320  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1321  "of serial team %p to %d\n",
1322  global_tid, serial_team, serial_team->t.t_level));
1323 
1324  /* allocate/push dispatch buffers stack */
1325  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1326  {
1327  dispatch_private_info_t *disp_buffer =
1328  (dispatch_private_info_t *)__kmp_allocate(
1329  sizeof(dispatch_private_info_t));
1330  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1331  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1332  }
1333  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1334 
1335  KMP_MB();
1336  }
1337  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1338 
1339  // Perform the display affinity functionality for
1340  // serialized parallel regions
1341  if (__kmp_display_affinity) {
1342  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1343  this_thr->th.th_prev_num_threads != 1) {
1344  // NULL means use the affinity-format-var ICV
1345  __kmp_aux_display_affinity(global_tid, NULL);
1346  this_thr->th.th_prev_level = serial_team->t.t_level;
1347  this_thr->th.th_prev_num_threads = 1;
1348  }
1349  }
1350 
1351  if (__kmp_env_consistency_check)
1352  __kmp_push_parallel(global_tid, NULL);
1353 #if OMPT_SUPPORT
1354  serial_team->t.ompt_team_info.master_return_address = codeptr;
1355  if (ompt_enabled.enabled &&
1356  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1357  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1358  OMPT_GET_FRAME_ADDRESS(0);
1359 
1360  ompt_lw_taskteam_t lw_taskteam;
1361  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1362  &ompt_parallel_data, codeptr);
1363 
1364  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1365  // don't use lw_taskteam after linking. content was swaped
1366 
1367  /* OMPT implicit task begin */
1368  if (ompt_enabled.ompt_callback_implicit_task) {
1369  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1370  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1371  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1372  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1373  OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1374  __kmp_tid_from_gtid(global_tid);
1375  }
1376 
1377  /* OMPT state */
1378  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1379  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1380  OMPT_GET_FRAME_ADDRESS(0);
1381  }
1382 #endif
1383 }
1384 
1385 // Test if this fork is for a team closely nested in a teams construct
1386 static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1387  microtask_t microtask, int level,
1388  int teams_level, kmp_va_list ap) {
1389  return (master_th->th.th_teams_microtask && ap &&
1390  microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1391 }
1392 
1393 // Test if this fork is for the teams construct, i.e. to form the outer league
1394 // of teams
1395 static inline bool __kmp_is_entering_teams(int active_level, int level,
1396  int teams_level, kmp_va_list ap) {
1397  return ((ap == NULL && active_level == 0) ||
1398  (ap && teams_level > 0 && teams_level == level));
1399 }
1400 
1401 // AC: This is start of parallel that is nested inside teams construct.
1402 // The team is actual (hot), all workers are ready at the fork barrier.
1403 // No lock needed to initialize the team a bit, then free workers.
1404 static inline int
1405 __kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1406  kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1407  enum fork_context_e call_context, microtask_t microtask,
1408  launch_t invoker, int master_set_numthreads, int level,
1409 #if OMPT_SUPPORT
1410  ompt_data_t ompt_parallel_data, void *return_address,
1411 #endif
1412  kmp_va_list ap) {
1413  void **argv;
1414  int i;
1415 
1416  parent_team->t.t_ident = loc;
1417  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1418  parent_team->t.t_argc = argc;
1419  argv = (void **)parent_team->t.t_argv;
1420  for (i = argc - 1; i >= 0; --i) {
1421  *argv++ = va_arg(kmp_va_deref(ap), void *);
1422  }
1423  // Increment our nested depth levels, but not increase the serialization
1424  if (parent_team == master_th->th.th_serial_team) {
1425  // AC: we are in serialized parallel
1426  __kmpc_serialized_parallel(loc, gtid);
1427  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1428 
1429  if (call_context == fork_context_gnu) {
1430  // AC: need to decrement t_serialized for enquiry functions to work
1431  // correctly, will restore at join time
1432  parent_team->t.t_serialized--;
1433  return TRUE;
1434  }
1435 
1436 #if OMPD_SUPPORT
1437  parent_team->t.t_pkfn = microtask;
1438 #endif
1439 
1440 #if OMPT_SUPPORT
1441  void *dummy;
1442  void **exit_frame_p;
1443  ompt_data_t *implicit_task_data;
1444  ompt_lw_taskteam_t lw_taskteam;
1445 
1446  if (ompt_enabled.enabled) {
1447  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1448  &ompt_parallel_data, return_address);
1449  exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1450 
1451  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1452  // Don't use lw_taskteam after linking. Content was swapped.
1453 
1454  /* OMPT implicit task begin */
1455  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1456  if (ompt_enabled.ompt_callback_implicit_task) {
1457  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1458  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1459  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1460  1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1461  }
1462 
1463  /* OMPT state */
1464  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1465  } else {
1466  exit_frame_p = &dummy;
1467  }
1468 #endif
1469 
1470  // AC: need to decrement t_serialized for enquiry functions to work
1471  // correctly, will restore at join time
1472  parent_team->t.t_serialized--;
1473 
1474  {
1475  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1476  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1477  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1478 #if OMPT_SUPPORT
1479  ,
1480  exit_frame_p
1481 #endif
1482  );
1483  }
1484 
1485 #if OMPT_SUPPORT
1486  if (ompt_enabled.enabled) {
1487  *exit_frame_p = NULL;
1488  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1489  if (ompt_enabled.ompt_callback_implicit_task) {
1490  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1491  ompt_scope_end, NULL, implicit_task_data, 1,
1492  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1493  }
1494  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1495  __ompt_lw_taskteam_unlink(master_th);
1496  if (ompt_enabled.ompt_callback_parallel_end) {
1497  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1498  &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1499  OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1500  }
1501  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1502  }
1503 #endif
1504  return TRUE;
1505  }
1506 
1507  parent_team->t.t_pkfn = microtask;
1508  parent_team->t.t_invoke = invoker;
1509  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1510  parent_team->t.t_active_level++;
1511  parent_team->t.t_level++;
1512  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1513 
1514  // If the threads allocated to the team are less than the thread limit, update
1515  // the thread limit here. th_teams_size.nth is specific to this team nested
1516  // in a teams construct, the team is fully created, and we're about to do
1517  // the actual fork. Best to do this here so that the subsequent uses below
1518  // and in the join have the correct value.
1519  master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1520 
1521 #if OMPT_SUPPORT
1522  if (ompt_enabled.enabled) {
1523  ompt_lw_taskteam_t lw_taskteam;
1524  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1525  return_address);
1526  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1527  }
1528 #endif
1529 
1530  /* Change number of threads in the team if requested */
1531  if (master_set_numthreads) { // The parallel has num_threads clause
1532  if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1533  // AC: only can reduce number of threads dynamically, can't increase
1534  kmp_info_t **other_threads = parent_team->t.t_threads;
1535  // NOTE: if using distributed barrier, we need to run this code block
1536  // even when the team size appears not to have changed from the max.
1537  int old_proc = master_th->th.th_teams_size.nth;
1538  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1539  __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1540  __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1541  }
1542  parent_team->t.t_nproc = master_set_numthreads;
1543  for (i = 0; i < master_set_numthreads; ++i) {
1544  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1545  }
1546  }
1547  // Keep extra threads hot in the team for possible next parallels
1548  master_th->th.th_set_nproc = 0;
1549  }
1550 
1551 #if USE_DEBUGGER
1552  if (__kmp_debugging) { // Let debugger override number of threads.
1553  int nth = __kmp_omp_num_threads(loc);
1554  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1555  master_set_numthreads = nth;
1556  }
1557  }
1558 #endif
1559 
1560  // Figure out the proc_bind policy for the nested parallel within teams
1561  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1562  // proc_bind_default means don't update
1563  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1564  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1565  proc_bind = proc_bind_false;
1566  } else {
1567  // No proc_bind clause specified; use current proc-bind-var
1568  if (proc_bind == proc_bind_default) {
1569  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1570  }
1571  /* else: The proc_bind policy was specified explicitly on parallel clause.
1572  This overrides proc-bind-var for this parallel region, but does not
1573  change proc-bind-var. */
1574  // Figure the value of proc-bind-var for the child threads.
1575  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1576  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1577  master_th->th.th_current_task->td_icvs.proc_bind)) {
1578  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1579  }
1580  }
1581  KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1582  // Need to change the bind-var ICV to correct value for each implicit task
1583  if (proc_bind_icv != proc_bind_default &&
1584  master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1585  kmp_info_t **other_threads = parent_team->t.t_threads;
1586  for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1587  other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1588  }
1589  }
1590  // Reset for next parallel region
1591  master_th->th.th_set_proc_bind = proc_bind_default;
1592 
1593 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1594  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1595  KMP_ITT_DEBUG) &&
1596  __kmp_forkjoin_frames_mode == 3 &&
1597  parent_team->t.t_active_level == 1 // only report frames at level 1
1598  && master_th->th.th_teams_size.nteams == 1) {
1599  kmp_uint64 tmp_time = __itt_get_timestamp();
1600  master_th->th.th_frame_time = tmp_time;
1601  parent_team->t.t_region_time = tmp_time;
1602  }
1603  if (__itt_stack_caller_create_ptr) {
1604  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1605  // create new stack stitching id before entering fork barrier
1606  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1607  }
1608 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1609 #if KMP_AFFINITY_SUPPORTED
1610  __kmp_partition_places(parent_team);
1611 #endif
1612 
1613  KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1614  "master_th=%p, gtid=%d\n",
1615  root, parent_team, master_th, gtid));
1616  __kmp_internal_fork(loc, gtid, parent_team);
1617  KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1618  "master_th=%p, gtid=%d\n",
1619  root, parent_team, master_th, gtid));
1620 
1621  if (call_context == fork_context_gnu)
1622  return TRUE;
1623 
1624  /* Invoke microtask for PRIMARY thread */
1625  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1626  parent_team->t.t_id, parent_team->t.t_pkfn));
1627 
1628  if (!parent_team->t.t_invoke(gtid)) {
1629  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1630  }
1631  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1632  parent_team->t.t_id, parent_team->t.t_pkfn));
1633  KMP_MB(); /* Flush all pending memory write invalidates. */
1634 
1635  KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1636 
1637  return TRUE;
1638 }
1639 
1640 // Create a serialized parallel region
1641 static inline int
1642 __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1643  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1644  kmp_info_t *master_th, kmp_team_t *parent_team,
1645 #if OMPT_SUPPORT
1646  ompt_data_t *ompt_parallel_data, void **return_address,
1647  ompt_data_t **parent_task_data,
1648 #endif
1649  kmp_va_list ap) {
1650  kmp_team_t *team;
1651  int i;
1652  void **argv;
1653 
1654 /* josh todo: hypothetical question: what do we do for OS X*? */
1655 #if KMP_OS_LINUX && \
1656  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1657  SimpleVLA<void *> args(argc);
1658 #else
1659  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1660 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1661  KMP_ARCH_AARCH64) */
1662 
1663  KA_TRACE(
1664  20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1665 
1666  __kmpc_serialized_parallel(loc, gtid);
1667 
1668 #if OMPD_SUPPORT
1669  master_th->th.th_serial_team->t.t_pkfn = microtask;
1670 #endif
1671 
1672  if (call_context == fork_context_intel) {
1673  /* TODO this sucks, use the compiler itself to pass args! :) */
1674  master_th->th.th_serial_team->t.t_ident = loc;
1675  if (!ap) {
1676  // revert change made in __kmpc_serialized_parallel()
1677  master_th->th.th_serial_team->t.t_level--;
1678 // Get args from parent team for teams construct
1679 
1680 #if OMPT_SUPPORT
1681  void *dummy;
1682  void **exit_frame_p;
1683  ompt_task_info_t *task_info;
1684  ompt_lw_taskteam_t lw_taskteam;
1685 
1686  if (ompt_enabled.enabled) {
1687  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1688  ompt_parallel_data, *return_address);
1689 
1690  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1691  // don't use lw_taskteam after linking. content was swaped
1692  task_info = OMPT_CUR_TASK_INFO(master_th);
1693  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1694  if (ompt_enabled.ompt_callback_implicit_task) {
1695  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1696  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1697  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1698  &(task_info->task_data), 1,
1699  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1700  }
1701 
1702  /* OMPT state */
1703  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1704  } else {
1705  exit_frame_p = &dummy;
1706  }
1707 #endif
1708 
1709  {
1710  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1711  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1712  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1713 #if OMPT_SUPPORT
1714  ,
1715  exit_frame_p
1716 #endif
1717  );
1718  }
1719 
1720 #if OMPT_SUPPORT
1721  if (ompt_enabled.enabled) {
1722  *exit_frame_p = NULL;
1723  if (ompt_enabled.ompt_callback_implicit_task) {
1724  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1725  ompt_scope_end, NULL, &(task_info->task_data), 1,
1726  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1727  }
1728  *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1729  __ompt_lw_taskteam_unlink(master_th);
1730  if (ompt_enabled.ompt_callback_parallel_end) {
1731  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1732  ompt_parallel_data, *parent_task_data,
1733  OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1734  }
1735  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1736  }
1737 #endif
1738  } else if (microtask == (microtask_t)__kmp_teams_master) {
1739  KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1740  team = master_th->th.th_team;
1741  // team->t.t_pkfn = microtask;
1742  team->t.t_invoke = invoker;
1743  __kmp_alloc_argv_entries(argc, team, TRUE);
1744  team->t.t_argc = argc;
1745  argv = (void **)team->t.t_argv;
1746  if (ap) {
1747  for (i = argc - 1; i >= 0; --i)
1748  *argv++ = va_arg(kmp_va_deref(ap), void *);
1749  } else {
1750  for (i = 0; i < argc; ++i)
1751  // Get args from parent team for teams construct
1752  argv[i] = parent_team->t.t_argv[i];
1753  }
1754  // AC: revert change made in __kmpc_serialized_parallel()
1755  // because initial code in teams should have level=0
1756  team->t.t_level--;
1757  // AC: call special invoker for outer "parallel" of teams construct
1758  invoker(gtid);
1759 #if OMPT_SUPPORT
1760  if (ompt_enabled.enabled) {
1761  ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1762  if (ompt_enabled.ompt_callback_implicit_task) {
1763  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1764  ompt_scope_end, NULL, &(task_info->task_data), 0,
1765  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1766  }
1767  if (ompt_enabled.ompt_callback_parallel_end) {
1768  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1769  ompt_parallel_data, *parent_task_data,
1770  OMPT_INVOKER(call_context) | ompt_parallel_league,
1771  *return_address);
1772  }
1773  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1774  }
1775 #endif
1776  } else {
1777  argv = args;
1778  for (i = argc - 1; i >= 0; --i)
1779  *argv++ = va_arg(kmp_va_deref(ap), void *);
1780  KMP_MB();
1781 
1782 #if OMPT_SUPPORT
1783  void *dummy;
1784  void **exit_frame_p;
1785  ompt_task_info_t *task_info;
1786  ompt_lw_taskteam_t lw_taskteam;
1787  ompt_data_t *implicit_task_data;
1788 
1789  if (ompt_enabled.enabled) {
1790  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1791  ompt_parallel_data, *return_address);
1792  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1793  // don't use lw_taskteam after linking. content was swaped
1794  task_info = OMPT_CUR_TASK_INFO(master_th);
1795  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1796 
1797  /* OMPT implicit task begin */
1798  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1799  if (ompt_enabled.ompt_callback_implicit_task) {
1800  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1801  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1802  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1803  ompt_task_implicit);
1804  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1805  }
1806 
1807  /* OMPT state */
1808  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1809  } else {
1810  exit_frame_p = &dummy;
1811  }
1812 #endif
1813 
1814  {
1815  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1816  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1817  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1818 #if OMPT_SUPPORT
1819  ,
1820  exit_frame_p
1821 #endif
1822  );
1823  }
1824 
1825 #if OMPT_SUPPORT
1826  if (ompt_enabled.enabled) {
1827  *exit_frame_p = NULL;
1828  if (ompt_enabled.ompt_callback_implicit_task) {
1829  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1830  ompt_scope_end, NULL, &(task_info->task_data), 1,
1831  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1832  }
1833 
1834  *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1835  __ompt_lw_taskteam_unlink(master_th);
1836  if (ompt_enabled.ompt_callback_parallel_end) {
1837  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1838  ompt_parallel_data, *parent_task_data,
1839  OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1840  }
1841  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1842  }
1843 #endif
1844  }
1845  } else if (call_context == fork_context_gnu) {
1846 #if OMPT_SUPPORT
1847  if (ompt_enabled.enabled) {
1848  ompt_lw_taskteam_t lwt;
1849  __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1850  *return_address);
1851 
1852  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1853  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1854  }
1855 // don't use lw_taskteam after linking. content was swaped
1856 #endif
1857 
1858  // we were called from GNU native code
1859  KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1860  return FALSE;
1861  } else {
1862  KMP_ASSERT2(call_context < fork_context_last,
1863  "__kmp_serial_fork_call: unknown fork_context parameter");
1864  }
1865 
1866  KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1867  KMP_MB();
1868  return FALSE;
1869 }
1870 
1871 /* most of the work for a fork */
1872 /* return true if we really went parallel, false if serialized */
1873 int __kmp_fork_call(ident_t *loc, int gtid,
1874  enum fork_context_e call_context, // Intel, GNU, ...
1875  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1876  kmp_va_list ap) {
1877  void **argv;
1878  int i;
1879  int master_tid;
1880  int master_this_cons;
1881  kmp_team_t *team;
1882  kmp_team_t *parent_team;
1883  kmp_info_t *master_th;
1884  kmp_root_t *root;
1885  int nthreads;
1886  int master_active;
1887  int master_set_numthreads;
1888  int task_thread_limit = 0;
1889  int level;
1890  int active_level;
1891  int teams_level;
1892 #if KMP_NESTED_HOT_TEAMS
1893  kmp_hot_team_ptr_t **p_hot_teams;
1894 #endif
1895  { // KMP_TIME_BLOCK
1896  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1897  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1898 
1899  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1900  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1901  /* Some systems prefer the stack for the root thread(s) to start with */
1902  /* some gap from the parent stack to prevent false sharing. */
1903  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1904  /* These 2 lines below are so this does not get optimized out */
1905  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1906  __kmp_stkpadding += (short)((kmp_int64)dummy);
1907  }
1908 
1909  /* initialize if needed */
1910  KMP_DEBUG_ASSERT(
1911  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1912  if (!TCR_4(__kmp_init_parallel))
1913  __kmp_parallel_initialize();
1914  __kmp_resume_if_soft_paused();
1915 
1916  /* setup current data */
1917  // AC: potentially unsafe, not in sync with library shutdown,
1918  // __kmp_threads can be freed
1919  master_th = __kmp_threads[gtid];
1920 
1921  parent_team = master_th->th.th_team;
1922  master_tid = master_th->th.th_info.ds.ds_tid;
1923  master_this_cons = master_th->th.th_local.this_construct;
1924  root = master_th->th.th_root;
1925  master_active = root->r.r_active;
1926  master_set_numthreads = master_th->th.th_set_nproc;
1927  task_thread_limit =
1928  master_th->th.th_current_task->td_icvs.task_thread_limit;
1929 
1930 #if OMPT_SUPPORT
1931  ompt_data_t ompt_parallel_data = ompt_data_none;
1932  ompt_data_t *parent_task_data;
1933  ompt_frame_t *ompt_frame;
1934  void *return_address = NULL;
1935 
1936  if (ompt_enabled.enabled) {
1937  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1938  NULL, NULL);
1939  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1940  }
1941 #endif
1942 
1943  // Assign affinity to root thread if it hasn't happened yet
1944  __kmp_assign_root_init_mask();
1945 
1946  // Nested level will be an index in the nested nthreads array
1947  level = parent_team->t.t_level;
1948  // used to launch non-serial teams even if nested is not allowed
1949  active_level = parent_team->t.t_active_level;
1950  // needed to check nesting inside the teams
1951  teams_level = master_th->th.th_teams_level;
1952 #if KMP_NESTED_HOT_TEAMS
1953  p_hot_teams = &master_th->th.th_hot_teams;
1954  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1955  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1956  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1957  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1958  // it is either actual or not needed (when active_level > 0)
1959  (*p_hot_teams)[0].hot_team_nth = 1;
1960  }
1961 #endif
1962 
1963 #if OMPT_SUPPORT
1964  if (ompt_enabled.enabled) {
1965  if (ompt_enabled.ompt_callback_parallel_begin) {
1966  int team_size = master_set_numthreads
1967  ? master_set_numthreads
1968  : get__nproc_2(parent_team, master_tid);
1969  int flags = OMPT_INVOKER(call_context) |
1970  ((microtask == (microtask_t)__kmp_teams_master)
1971  ? ompt_parallel_league
1972  : ompt_parallel_team);
1973  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1974  parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1975  return_address);
1976  }
1977  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1978  }
1979 #endif
1980 
1981  master_th->th.th_ident = loc;
1982 
1983  // Parallel closely nested in teams construct:
1984  if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
1985  return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
1986  call_context, microtask, invoker,
1987  master_set_numthreads, level,
1988 #if OMPT_SUPPORT
1989  ompt_parallel_data, return_address,
1990 #endif
1991  ap);
1992  } // End parallel closely nested in teams construct
1993 
1994 #if KMP_DEBUG
1995  if (__kmp_tasking_mode != tskm_immediate_exec) {
1996  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1997  parent_team->t.t_task_team[master_th->th.th_task_state]);
1998  }
1999 #endif
2000 
2001  // Need this to happen before we determine the number of threads, not while
2002  // we are allocating the team
2003  //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
2004 
2005  // Determine the number of threads
2006  int enter_teams =
2007  __kmp_is_entering_teams(active_level, level, teams_level, ap);
2008  if ((!enter_teams &&
2009  (parent_team->t.t_active_level >=
2010  master_th->th.th_current_task->td_icvs.max_active_levels)) ||
2011  (__kmp_library == library_serial)) {
2012  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
2013  nthreads = 1;
2014  } else {
2015  nthreads = master_set_numthreads
2016  ? master_set_numthreads
2017  // TODO: get nproc directly from current task
2018  : get__nproc_2(parent_team, master_tid);
2019  // Use the thread_limit set for the current target task if exists, else go
2020  // with the deduced nthreads
2021  nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
2022  ? task_thread_limit
2023  : nthreads;
2024  // Check if we need to take forkjoin lock? (no need for serialized
2025  // parallel out of teams construct).
2026  if (nthreads > 1) {
2027  /* determine how many new threads we can use */
2028  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2029  /* AC: If we execute teams from parallel region (on host), then teams
2030  should be created but each can only have 1 thread if nesting is
2031  disabled. If teams called from serial region, then teams and their
2032  threads should be created regardless of the nesting setting. */
2033  nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2034  nthreads, enter_teams);
2035  if (nthreads == 1) {
2036  // Free lock for single thread execution here; for multi-thread
2037  // execution it will be freed later after team of threads created
2038  // and initialized
2039  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2040  }
2041  }
2042  }
2043  KMP_DEBUG_ASSERT(nthreads > 0);
2044 
2045  // If we temporarily changed the set number of threads then restore it now
2046  master_th->th.th_set_nproc = 0;
2047 
2048  if (nthreads == 1) {
2049  return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2050  invoker, master_th, parent_team,
2051 #if OMPT_SUPPORT
2052  &ompt_parallel_data, &return_address,
2053  &parent_task_data,
2054 #endif
2055  ap);
2056  } // if (nthreads == 1)
2057 
2058  // GEH: only modify the executing flag in the case when not serialized
2059  // serialized case is handled in kmpc_serialized_parallel
2060  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2061  "curtask=%p, curtask_max_aclevel=%d\n",
2062  parent_team->t.t_active_level, master_th,
2063  master_th->th.th_current_task,
2064  master_th->th.th_current_task->td_icvs.max_active_levels));
2065  // TODO: GEH - cannot do this assertion because root thread not set up as
2066  // executing
2067  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2068  master_th->th.th_current_task->td_flags.executing = 0;
2069 
2070  if (!master_th->th.th_teams_microtask || level > teams_level) {
2071  /* Increment our nested depth level */
2072  KMP_ATOMIC_INC(&root->r.r_in_parallel);
2073  }
2074 
2075  // See if we need to make a copy of the ICVs.
2076  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2077  if ((level + 1 < __kmp_nested_nth.used) &&
2078  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
2079  nthreads_icv = __kmp_nested_nth.nth[level + 1];
2080  } else {
2081  nthreads_icv = 0; // don't update
2082  }
2083 
2084  // Figure out the proc_bind_policy for the new team.
2085  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2086  // proc_bind_default means don't update
2087  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2088  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2089  proc_bind = proc_bind_false;
2090  } else {
2091  // No proc_bind clause specified; use current proc-bind-var for this
2092  // parallel region
2093  if (proc_bind == proc_bind_default) {
2094  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2095  }
2096  // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2097  if (master_th->th.th_teams_microtask &&
2098  microtask == (microtask_t)__kmp_teams_master) {
2099  proc_bind = __kmp_teams_proc_bind;
2100  }
2101  /* else: The proc_bind policy was specified explicitly on parallel clause.
2102  This overrides proc-bind-var for this parallel region, but does not
2103  change proc-bind-var. */
2104  // Figure the value of proc-bind-var for the child threads.
2105  if ((level + 1 < __kmp_nested_proc_bind.used) &&
2106  (__kmp_nested_proc_bind.bind_types[level + 1] !=
2107  master_th->th.th_current_task->td_icvs.proc_bind)) {
2108  // Do not modify the proc bind icv for the two teams construct forks
2109  // They just let the proc bind icv pass through
2110  if (!master_th->th.th_teams_microtask ||
2111  !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2112  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2113  }
2114  }
2115 
2116  // Reset for next parallel region
2117  master_th->th.th_set_proc_bind = proc_bind_default;
2118 
2119  if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2120  kmp_internal_control_t new_icvs;
2121  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2122  new_icvs.next = NULL;
2123  if (nthreads_icv > 0) {
2124  new_icvs.nproc = nthreads_icv;
2125  }
2126  if (proc_bind_icv != proc_bind_default) {
2127  new_icvs.proc_bind = proc_bind_icv;
2128  }
2129 
2130  /* allocate a new parallel team */
2131  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2132  team = __kmp_allocate_team(root, nthreads, nthreads,
2133 #if OMPT_SUPPORT
2134  ompt_parallel_data,
2135 #endif
2136  proc_bind, &new_icvs,
2137  argc USE_NESTED_HOT_ARG(master_th));
2138  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2139  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2140  } else {
2141  /* allocate a new parallel team */
2142  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2143  team = __kmp_allocate_team(root, nthreads, nthreads,
2144 #if OMPT_SUPPORT
2145  ompt_parallel_data,
2146 #endif
2147  proc_bind,
2148  &master_th->th.th_current_task->td_icvs,
2149  argc USE_NESTED_HOT_ARG(master_th));
2150  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2151  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2152  &master_th->th.th_current_task->td_icvs);
2153  }
2154  KF_TRACE(
2155  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2156 
2157  /* setup the new team */
2158  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2159  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2160  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2161  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2162  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2163 #if OMPT_SUPPORT
2164  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2165  return_address);
2166 #endif
2167  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2168  // TODO: parent_team->t.t_level == INT_MAX ???
2169  if (!master_th->th.th_teams_microtask || level > teams_level) {
2170  int new_level = parent_team->t.t_level + 1;
2171  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2172  new_level = parent_team->t.t_active_level + 1;
2173  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2174  } else {
2175  // AC: Do not increase parallel level at start of the teams construct
2176  int new_level = parent_team->t.t_level;
2177  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2178  new_level = parent_team->t.t_active_level;
2179  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2180  }
2181  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2182  // set primary thread's schedule as new run-time schedule
2183  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2184 
2185  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2186  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2187 
2188  // Update the floating point rounding in the team if required.
2189  propagateFPControl(team);
2190 #if OMPD_SUPPORT
2191  if (ompd_state & OMPD_ENABLE_BP)
2192  ompd_bp_parallel_begin();
2193 #endif
2194 
2195  if (__kmp_tasking_mode != tskm_immediate_exec) {
2196  // Set primary thread's task team to team's task team. Unless this is hot
2197  // team, it should be NULL.
2198  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2199  parent_team->t.t_task_team[master_th->th.th_task_state]);
2200  KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2201  "%p, new task_team %p / team %p\n",
2202  __kmp_gtid_from_thread(master_th),
2203  master_th->th.th_task_team, parent_team,
2204  team->t.t_task_team[master_th->th.th_task_state], team));
2205 
2206  if (active_level || master_th->th.th_task_team) {
2207  // Take a memo of primary thread's task_state
2208  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2209  if (master_th->th.th_task_state_top >=
2210  master_th->th.th_task_state_stack_sz) { // increase size
2211  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2212  kmp_uint8 *old_stack, *new_stack;
2213  kmp_uint32 i;
2214  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2215  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2216  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2217  }
2218  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2219  ++i) { // zero-init rest of stack
2220  new_stack[i] = 0;
2221  }
2222  old_stack = master_th->th.th_task_state_memo_stack;
2223  master_th->th.th_task_state_memo_stack = new_stack;
2224  master_th->th.th_task_state_stack_sz = new_size;
2225  __kmp_free(old_stack);
2226  }
2227  // Store primary thread's task_state on stack
2228  master_th->th
2229  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2230  master_th->th.th_task_state;
2231  master_th->th.th_task_state_top++;
2232 #if KMP_NESTED_HOT_TEAMS
2233  if (master_th->th.th_hot_teams &&
2234  active_level < __kmp_hot_teams_max_level &&
2235  team == master_th->th.th_hot_teams[active_level].hot_team) {
2236  // Restore primary thread's nested state if nested hot team
2237  master_th->th.th_task_state =
2238  master_th->th
2239  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2240  } else {
2241 #endif
2242  master_th->th.th_task_state = 0;
2243 #if KMP_NESTED_HOT_TEAMS
2244  }
2245 #endif
2246  }
2247 #if !KMP_NESTED_HOT_TEAMS
2248  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2249  (team == root->r.r_hot_team));
2250 #endif
2251  }
2252 
2253  KA_TRACE(
2254  20,
2255  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2256  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2257  team->t.t_nproc));
2258  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2259  (team->t.t_master_tid == 0 &&
2260  (team->t.t_parent == root->r.r_root_team ||
2261  team->t.t_parent->t.t_serialized)));
2262  KMP_MB();
2263 
2264  /* now, setup the arguments */
2265  argv = (void **)team->t.t_argv;
2266  if (ap) {
2267  for (i = argc - 1; i >= 0; --i) {
2268  void *new_argv = va_arg(kmp_va_deref(ap), void *);
2269  KMP_CHECK_UPDATE(*argv, new_argv);
2270  argv++;
2271  }
2272  } else {
2273  for (i = 0; i < argc; ++i) {
2274  // Get args from parent team for teams construct
2275  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2276  }
2277  }
2278 
2279  /* now actually fork the threads */
2280  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2281  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2282  root->r.r_active = TRUE;
2283 
2284  __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2285  __kmp_setup_icv_copy(team, nthreads,
2286  &master_th->th.th_current_task->td_icvs, loc);
2287 
2288 #if OMPT_SUPPORT
2289  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2290 #endif
2291 
2292  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2293 
2294 #if USE_ITT_BUILD
2295  if (team->t.t_active_level == 1 // only report frames at level 1
2296  && !master_th->th.th_teams_microtask) { // not in teams construct
2297 #if USE_ITT_NOTIFY
2298  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2299  (__kmp_forkjoin_frames_mode == 3 ||
2300  __kmp_forkjoin_frames_mode == 1)) {
2301  kmp_uint64 tmp_time = 0;
2302  if (__itt_get_timestamp_ptr)
2303  tmp_time = __itt_get_timestamp();
2304  // Internal fork - report frame begin
2305  master_th->th.th_frame_time = tmp_time;
2306  if (__kmp_forkjoin_frames_mode == 3)
2307  team->t.t_region_time = tmp_time;
2308  } else
2309 // only one notification scheme (either "submit" or "forking/joined", not both)
2310 #endif /* USE_ITT_NOTIFY */
2311  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2312  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2313  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2314  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2315  }
2316  }
2317 #endif /* USE_ITT_BUILD */
2318 
2319  /* now go on and do the work */
2320  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2321  KMP_MB();
2322  KF_TRACE(10,
2323  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2324  root, team, master_th, gtid));
2325 
2326 #if USE_ITT_BUILD
2327  if (__itt_stack_caller_create_ptr) {
2328  // create new stack stitching id before entering fork barrier
2329  if (!enter_teams) {
2330  KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2331  team->t.t_stack_id = __kmp_itt_stack_caller_create();
2332  } else if (parent_team->t.t_serialized) {
2333  // keep stack stitching id in the serialized parent_team;
2334  // current team will be used for parallel inside the teams;
2335  // if parent_team is active, then it already keeps stack stitching id
2336  // for the league of teams
2337  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2338  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2339  }
2340  }
2341 #endif /* USE_ITT_BUILD */
2342 
2343  // AC: skip __kmp_internal_fork at teams construct, let only primary
2344  // threads execute
2345  if (ap) {
2346  __kmp_internal_fork(loc, gtid, team);
2347  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2348  "master_th=%p, gtid=%d\n",
2349  root, team, master_th, gtid));
2350  }
2351 
2352  if (call_context == fork_context_gnu) {
2353  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2354  return TRUE;
2355  }
2356 
2357  /* Invoke microtask for PRIMARY thread */
2358  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2359  team->t.t_id, team->t.t_pkfn));
2360  } // END of timer KMP_fork_call block
2361 
2362 #if KMP_STATS_ENABLED
2363  // If beginning a teams construct, then change thread state
2364  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2365  if (!ap) {
2366  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2367  }
2368 #endif
2369 
2370  if (!team->t.t_invoke(gtid)) {
2371  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2372  }
2373 
2374 #if KMP_STATS_ENABLED
2375  // If was beginning of a teams construct, then reset thread state
2376  if (!ap) {
2377  KMP_SET_THREAD_STATE(previous_state);
2378  }
2379 #endif
2380 
2381  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2382  team->t.t_id, team->t.t_pkfn));
2383  KMP_MB(); /* Flush all pending memory write invalidates. */
2384 
2385  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2386 #if OMPT_SUPPORT
2387  if (ompt_enabled.enabled) {
2388  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2389  }
2390 #endif
2391 
2392  return TRUE;
2393 }
2394 
2395 #if OMPT_SUPPORT
2396 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2397  kmp_team_t *team) {
2398  // restore state outside the region
2399  thread->th.ompt_thread_info.state =
2400  ((team->t.t_serialized) ? ompt_state_work_serial
2401  : ompt_state_work_parallel);
2402 }
2403 
2404 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2405  kmp_team_t *team, ompt_data_t *parallel_data,
2406  int flags, void *codeptr) {
2407  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2408  if (ompt_enabled.ompt_callback_parallel_end) {
2409  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2410  parallel_data, &(task_info->task_data), flags, codeptr);
2411  }
2412 
2413  task_info->frame.enter_frame = ompt_data_none;
2414  __kmp_join_restore_state(thread, team);
2415 }
2416 #endif
2417 
2418 void __kmp_join_call(ident_t *loc, int gtid
2419 #if OMPT_SUPPORT
2420  ,
2421  enum fork_context_e fork_context
2422 #endif
2423  ,
2424  int exit_teams) {
2425  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2426  kmp_team_t *team;
2427  kmp_team_t *parent_team;
2428  kmp_info_t *master_th;
2429  kmp_root_t *root;
2430  int master_active;
2431 
2432  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2433 
2434  /* setup current data */
2435  master_th = __kmp_threads[gtid];
2436  root = master_th->th.th_root;
2437  team = master_th->th.th_team;
2438  parent_team = team->t.t_parent;
2439 
2440  master_th->th.th_ident = loc;
2441 
2442 #if OMPT_SUPPORT
2443  void *team_microtask = (void *)team->t.t_pkfn;
2444  // For GOMP interface with serialized parallel, need the
2445  // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2446  // and end-parallel events.
2447  if (ompt_enabled.enabled &&
2448  !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2449  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2450  }
2451 #endif
2452 
2453 #if KMP_DEBUG
2454  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2455  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2456  "th_task_team = %p\n",
2457  __kmp_gtid_from_thread(master_th), team,
2458  team->t.t_task_team[master_th->th.th_task_state],
2459  master_th->th.th_task_team));
2460  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2461  team->t.t_task_team[master_th->th.th_task_state]);
2462  }
2463 #endif
2464 
2465  if (team->t.t_serialized) {
2466  if (master_th->th.th_teams_microtask) {
2467  // We are in teams construct
2468  int level = team->t.t_level;
2469  int tlevel = master_th->th.th_teams_level;
2470  if (level == tlevel) {
2471  // AC: we haven't incremented it earlier at start of teams construct,
2472  // so do it here - at the end of teams construct
2473  team->t.t_level++;
2474  } else if (level == tlevel + 1) {
2475  // AC: we are exiting parallel inside teams, need to increment
2476  // serialization in order to restore it in the next call to
2477  // __kmpc_end_serialized_parallel
2478  team->t.t_serialized++;
2479  }
2480  }
2481  __kmpc_end_serialized_parallel(loc, gtid);
2482 
2483 #if OMPT_SUPPORT
2484  if (ompt_enabled.enabled) {
2485  if (fork_context == fork_context_gnu) {
2486  __ompt_lw_taskteam_unlink(master_th);
2487  }
2488  __kmp_join_restore_state(master_th, parent_team);
2489  }
2490 #endif
2491 
2492  return;
2493  }
2494 
2495  master_active = team->t.t_master_active;
2496 
2497  if (!exit_teams) {
2498  // AC: No barrier for internal teams at exit from teams construct.
2499  // But there is barrier for external team (league).
2500  __kmp_internal_join(loc, gtid, team);
2501 #if USE_ITT_BUILD
2502  if (__itt_stack_caller_create_ptr) {
2503  KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2504  // destroy the stack stitching id after join barrier
2505  __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2506  team->t.t_stack_id = NULL;
2507  }
2508 #endif
2509  } else {
2510  master_th->th.th_task_state =
2511  0; // AC: no tasking in teams (out of any parallel)
2512 #if USE_ITT_BUILD
2513  if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2514  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2515  // destroy the stack stitching id on exit from the teams construct
2516  // if parent_team is active, then the id will be destroyed later on
2517  // by master of the league of teams
2518  __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2519  parent_team->t.t_stack_id = NULL;
2520  }
2521 #endif
2522  }
2523 
2524  KMP_MB();
2525 
2526 #if OMPT_SUPPORT
2527  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2528  void *codeptr = team->t.ompt_team_info.master_return_address;
2529 #endif
2530 
2531 #if USE_ITT_BUILD
2532  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2533  if (team->t.t_active_level == 1 &&
2534  (!master_th->th.th_teams_microtask || /* not in teams construct */
2535  master_th->th.th_teams_size.nteams == 1)) {
2536  master_th->th.th_ident = loc;
2537  // only one notification scheme (either "submit" or "forking/joined", not
2538  // both)
2539  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2540  __kmp_forkjoin_frames_mode == 3)
2541  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2542  master_th->th.th_frame_time, 0, loc,
2543  master_th->th.th_team_nproc, 1);
2544  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2545  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2546  __kmp_itt_region_joined(gtid);
2547  } // active_level == 1
2548 #endif /* USE_ITT_BUILD */
2549 
2550 #if KMP_AFFINITY_SUPPORTED
2551  if (!exit_teams) {
2552  // Restore master thread's partition.
2553  master_th->th.th_first_place = team->t.t_first_place;
2554  master_th->th.th_last_place = team->t.t_last_place;
2555  }
2556 #endif // KMP_AFFINITY_SUPPORTED
2557 
2558  if (master_th->th.th_teams_microtask && !exit_teams &&
2559  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2560  team->t.t_level == master_th->th.th_teams_level + 1) {
2561 // AC: We need to leave the team structure intact at the end of parallel
2562 // inside the teams construct, so that at the next parallel same (hot) team
2563 // works, only adjust nesting levels
2564 #if OMPT_SUPPORT
2565  ompt_data_t ompt_parallel_data = ompt_data_none;
2566  if (ompt_enabled.enabled) {
2567  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2568  if (ompt_enabled.ompt_callback_implicit_task) {
2569  int ompt_team_size = team->t.t_nproc;
2570  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2571  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2572  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2573  }
2574  task_info->frame.exit_frame = ompt_data_none;
2575  task_info->task_data = ompt_data_none;
2576  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2577  __ompt_lw_taskteam_unlink(master_th);
2578  }
2579 #endif
2580  /* Decrement our nested depth level */
2581  team->t.t_level--;
2582  team->t.t_active_level--;
2583  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2584 
2585  // Restore number of threads in the team if needed. This code relies on
2586  // the proper adjustment of th_teams_size.nth after the fork in
2587  // __kmp_teams_master on each teams primary thread in the case that
2588  // __kmp_reserve_threads reduced it.
2589  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2590  int old_num = master_th->th.th_team_nproc;
2591  int new_num = master_th->th.th_teams_size.nth;
2592  kmp_info_t **other_threads = team->t.t_threads;
2593  team->t.t_nproc = new_num;
2594  for (int i = 0; i < old_num; ++i) {
2595  other_threads[i]->th.th_team_nproc = new_num;
2596  }
2597  // Adjust states of non-used threads of the team
2598  for (int i = old_num; i < new_num; ++i) {
2599  // Re-initialize thread's barrier data.
2600  KMP_DEBUG_ASSERT(other_threads[i]);
2601  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2602  for (int b = 0; b < bs_last_barrier; ++b) {
2603  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2604  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2605 #if USE_DEBUGGER
2606  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2607 #endif
2608  }
2609  if (__kmp_tasking_mode != tskm_immediate_exec) {
2610  // Synchronize thread's task state
2611  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2612  }
2613  }
2614  }
2615 
2616 #if OMPT_SUPPORT
2617  if (ompt_enabled.enabled) {
2618  __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2619  OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2620  }
2621 #endif
2622 
2623  return;
2624  }
2625 
2626  /* do cleanup and restore the parent team */
2627  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2628  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2629 
2630  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2631 
2632  /* jc: The following lock has instructions with REL and ACQ semantics,
2633  separating the parallel user code called in this parallel region
2634  from the serial user code called after this function returns. */
2635  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2636 
2637  if (!master_th->th.th_teams_microtask ||
2638  team->t.t_level > master_th->th.th_teams_level) {
2639  /* Decrement our nested depth level */
2640  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2641  }
2642  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2643 
2644 #if OMPT_SUPPORT
2645  if (ompt_enabled.enabled) {
2646  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2647  if (ompt_enabled.ompt_callback_implicit_task) {
2648  int flags = (team_microtask == (void *)__kmp_teams_master)
2649  ? ompt_task_initial
2650  : ompt_task_implicit;
2651  int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2652  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2653  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2654  OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2655  }
2656  task_info->frame.exit_frame = ompt_data_none;
2657  task_info->task_data = ompt_data_none;
2658  }
2659 #endif
2660 
2661  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2662  master_th, team));
2663  __kmp_pop_current_task_from_thread(master_th);
2664 
2665  master_th->th.th_def_allocator = team->t.t_def_allocator;
2666 
2667 #if OMPD_SUPPORT
2668  if (ompd_state & OMPD_ENABLE_BP)
2669  ompd_bp_parallel_end();
2670 #endif
2671  updateHWFPControl(team);
2672 
2673  if (root->r.r_active != master_active)
2674  root->r.r_active = master_active;
2675 
2676  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2677  master_th)); // this will free worker threads
2678 
2679  /* this race was fun to find. make sure the following is in the critical
2680  region otherwise assertions may fail occasionally since the old team may be
2681  reallocated and the hierarchy appears inconsistent. it is actually safe to
2682  run and won't cause any bugs, but will cause those assertion failures. it's
2683  only one deref&assign so might as well put this in the critical region */
2684  master_th->th.th_team = parent_team;
2685  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2686  master_th->th.th_team_master = parent_team->t.t_threads[0];
2687  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2688 
2689  /* restore serialized team, if need be */
2690  if (parent_team->t.t_serialized &&
2691  parent_team != master_th->th.th_serial_team &&
2692  parent_team != root->r.r_root_team) {
2693  __kmp_free_team(root,
2694  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2695  master_th->th.th_serial_team = parent_team;
2696  }
2697 
2698  if (__kmp_tasking_mode != tskm_immediate_exec) {
2699  if (master_th->th.th_task_state_top >
2700  0) { // Restore task state from memo stack
2701  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2702  // Remember primary thread's state if we re-use this nested hot team
2703  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2704  master_th->th.th_task_state;
2705  --master_th->th.th_task_state_top; // pop
2706  // Now restore state at this level
2707  master_th->th.th_task_state =
2708  master_th->th
2709  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2710  } else if (team != root->r.r_hot_team) {
2711  // Reset the task state of primary thread if we are not hot team because
2712  // in this case all the worker threads will be free, and their task state
2713  // will be reset. If not reset the primary's, the task state will be
2714  // inconsistent.
2715  master_th->th.th_task_state = 0;
2716  }
2717  // Copy the task team from the parent team to the primary thread
2718  master_th->th.th_task_team =
2719  parent_team->t.t_task_team[master_th->th.th_task_state];
2720  KA_TRACE(20,
2721  ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2722  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2723  parent_team));
2724  }
2725 
2726  // TODO: GEH - cannot do this assertion because root thread not set up as
2727  // executing
2728  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2729  master_th->th.th_current_task->td_flags.executing = 1;
2730 
2731  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2732 
2733 #if KMP_AFFINITY_SUPPORTED
2734  if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2735  __kmp_reset_root_init_mask(gtid);
2736  }
2737 #endif
2738 #if OMPT_SUPPORT
2739  int flags =
2740  OMPT_INVOKER(fork_context) |
2741  ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2742  : ompt_parallel_team);
2743  if (ompt_enabled.enabled) {
2744  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2745  codeptr);
2746  }
2747 #endif
2748 
2749  KMP_MB();
2750  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2751 }
2752 
2753 /* Check whether we should push an internal control record onto the
2754  serial team stack. If so, do it. */
2755 void __kmp_save_internal_controls(kmp_info_t *thread) {
2756 
2757  if (thread->th.th_team != thread->th.th_serial_team) {
2758  return;
2759  }
2760  if (thread->th.th_team->t.t_serialized > 1) {
2761  int push = 0;
2762 
2763  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2764  push = 1;
2765  } else {
2766  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2767  thread->th.th_team->t.t_serialized) {
2768  push = 1;
2769  }
2770  }
2771  if (push) { /* push a record on the serial team's stack */
2772  kmp_internal_control_t *control =
2773  (kmp_internal_control_t *)__kmp_allocate(
2774  sizeof(kmp_internal_control_t));
2775 
2776  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2777 
2778  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2779 
2780  control->next = thread->th.th_team->t.t_control_stack_top;
2781  thread->th.th_team->t.t_control_stack_top = control;
2782  }
2783  }
2784 }
2785 
2786 /* Changes set_nproc */
2787 void __kmp_set_num_threads(int new_nth, int gtid) {
2788  kmp_info_t *thread;
2789  kmp_root_t *root;
2790 
2791  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2792  KMP_DEBUG_ASSERT(__kmp_init_serial);
2793 
2794  if (new_nth < 1)
2795  new_nth = 1;
2796  else if (new_nth > __kmp_max_nth)
2797  new_nth = __kmp_max_nth;
2798 
2799  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2800  thread = __kmp_threads[gtid];
2801  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2802  return; // nothing to do
2803 
2804  __kmp_save_internal_controls(thread);
2805 
2806  set__nproc(thread, new_nth);
2807 
2808  // If this omp_set_num_threads() call will cause the hot team size to be
2809  // reduced (in the absence of a num_threads clause), then reduce it now,
2810  // rather than waiting for the next parallel region.
2811  root = thread->th.th_root;
2812  if (__kmp_init_parallel && (!root->r.r_active) &&
2813  (root->r.r_hot_team->t.t_nproc > new_nth)
2814 #if KMP_NESTED_HOT_TEAMS
2815  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2816 #endif
2817  ) {
2818  kmp_team_t *hot_team = root->r.r_hot_team;
2819  int f;
2820 
2821  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2822 
2823  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2824  __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2825  }
2826  // Release the extra threads we don't need any more.
2827  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2828  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2829  if (__kmp_tasking_mode != tskm_immediate_exec) {
2830  // When decreasing team size, threads no longer in the team should unref
2831  // task team.
2832  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2833  }
2834  __kmp_free_thread(hot_team->t.t_threads[f]);
2835  hot_team->t.t_threads[f] = NULL;
2836  }
2837  hot_team->t.t_nproc = new_nth;
2838 #if KMP_NESTED_HOT_TEAMS
2839  if (thread->th.th_hot_teams) {
2840  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2841  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2842  }
2843 #endif
2844 
2845  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2846  hot_team->t.b->update_num_threads(new_nth);
2847  __kmp_add_threads_to_team(hot_team, new_nth);
2848  }
2849 
2850  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2851 
2852  // Update the t_nproc field in the threads that are still active.
2853  for (f = 0; f < new_nth; f++) {
2854  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2855  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2856  }
2857  // Special flag in case omp_set_num_threads() call
2858  hot_team->t.t_size_changed = -1;
2859  }
2860 }
2861 
2862 /* Changes max_active_levels */
2863 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2864  kmp_info_t *thread;
2865 
2866  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2867  "%d = (%d)\n",
2868  gtid, max_active_levels));
2869  KMP_DEBUG_ASSERT(__kmp_init_serial);
2870 
2871  // validate max_active_levels
2872  if (max_active_levels < 0) {
2873  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2874  // We ignore this call if the user has specified a negative value.
2875  // The current setting won't be changed. The last valid setting will be
2876  // used. A warning will be issued (if warnings are allowed as controlled by
2877  // the KMP_WARNINGS env var).
2878  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2879  "max_active_levels for thread %d = (%d)\n",
2880  gtid, max_active_levels));
2881  return;
2882  }
2883  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2884  // it's OK, the max_active_levels is within the valid range: [ 0;
2885  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2886  // We allow a zero value. (implementation defined behavior)
2887  } else {
2888  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2889  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2890  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2891  // Current upper limit is MAX_INT. (implementation defined behavior)
2892  // If the input exceeds the upper limit, we correct the input to be the
2893  // upper limit. (implementation defined behavior)
2894  // Actually, the flow should never get here until we use MAX_INT limit.
2895  }
2896  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2897  "max_active_levels for thread %d = (%d)\n",
2898  gtid, max_active_levels));
2899 
2900  thread = __kmp_threads[gtid];
2901 
2902  __kmp_save_internal_controls(thread);
2903 
2904  set__max_active_levels(thread, max_active_levels);
2905 }
2906 
2907 /* Gets max_active_levels */
2908 int __kmp_get_max_active_levels(int gtid) {
2909  kmp_info_t *thread;
2910 
2911  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2912  KMP_DEBUG_ASSERT(__kmp_init_serial);
2913 
2914  thread = __kmp_threads[gtid];
2915  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2916  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2917  "curtask_maxaclevel=%d\n",
2918  gtid, thread->th.th_current_task,
2919  thread->th.th_current_task->td_icvs.max_active_levels));
2920  return thread->th.th_current_task->td_icvs.max_active_levels;
2921 }
2922 
2923 // nteams-var per-device ICV
2924 void __kmp_set_num_teams(int num_teams) {
2925  if (num_teams > 0)
2926  __kmp_nteams = num_teams;
2927 }
2928 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2929 // teams-thread-limit-var per-device ICV
2930 void __kmp_set_teams_thread_limit(int limit) {
2931  if (limit > 0)
2932  __kmp_teams_thread_limit = limit;
2933 }
2934 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2935 
2936 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2937 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2938 
2939 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2940 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2941  kmp_info_t *thread;
2942  kmp_sched_t orig_kind;
2943  // kmp_team_t *team;
2944 
2945  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2946  gtid, (int)kind, chunk));
2947  KMP_DEBUG_ASSERT(__kmp_init_serial);
2948 
2949  // Check if the kind parameter is valid, correct if needed.
2950  // Valid parameters should fit in one of two intervals - standard or extended:
2951  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2952  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2953  orig_kind = kind;
2954  kind = __kmp_sched_without_mods(kind);
2955 
2956  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2957  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2958  // TODO: Hint needs attention in case we change the default schedule.
2959  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2960  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2961  __kmp_msg_null);
2962  kind = kmp_sched_default;
2963  chunk = 0; // ignore chunk value in case of bad kind
2964  }
2965 
2966  thread = __kmp_threads[gtid];
2967 
2968  __kmp_save_internal_controls(thread);
2969 
2970  if (kind < kmp_sched_upper_std) {
2971  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2972  // differ static chunked vs. unchunked: chunk should be invalid to
2973  // indicate unchunked schedule (which is the default)
2974  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2975  } else {
2976  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2977  __kmp_sch_map[kind - kmp_sched_lower - 1];
2978  }
2979  } else {
2980  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2981  // kmp_sched_lower - 2 ];
2982  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2983  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2984  kmp_sched_lower - 2];
2985  }
2986  __kmp_sched_apply_mods_intkind(
2987  orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2988  if (kind == kmp_sched_auto || chunk < 1) {
2989  // ignore parameter chunk for schedule auto
2990  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2991  } else {
2992  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2993  }
2994 }
2995 
2996 /* Gets def_sched_var ICV values */
2997 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2998  kmp_info_t *thread;
2999  enum sched_type th_type;
3000 
3001  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
3002  KMP_DEBUG_ASSERT(__kmp_init_serial);
3003 
3004  thread = __kmp_threads[gtid];
3005 
3006  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
3007  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
3008  case kmp_sch_static:
3009  case kmp_sch_static_greedy:
3010  case kmp_sch_static_balanced:
3011  *kind = kmp_sched_static;
3012  __kmp_sched_apply_mods_stdkind(kind, th_type);
3013  *chunk = 0; // chunk was not set, try to show this fact via zero value
3014  return;
3015  case kmp_sch_static_chunked:
3016  *kind = kmp_sched_static;
3017  break;
3018  case kmp_sch_dynamic_chunked:
3019  *kind = kmp_sched_dynamic;
3020  break;
3022  case kmp_sch_guided_iterative_chunked:
3023  case kmp_sch_guided_analytical_chunked:
3024  *kind = kmp_sched_guided;
3025  break;
3026  case kmp_sch_auto:
3027  *kind = kmp_sched_auto;
3028  break;
3029  case kmp_sch_trapezoidal:
3030  *kind = kmp_sched_trapezoidal;
3031  break;
3032 #if KMP_STATIC_STEAL_ENABLED
3033  case kmp_sch_static_steal:
3034  *kind = kmp_sched_static_steal;
3035  break;
3036 #endif
3037  default:
3038  KMP_FATAL(UnknownSchedulingType, th_type);
3039  }
3040 
3041  __kmp_sched_apply_mods_stdkind(kind, th_type);
3042  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3043 }
3044 
3045 int __kmp_get_ancestor_thread_num(int gtid, int level) {
3046 
3047  int ii, dd;
3048  kmp_team_t *team;
3049  kmp_info_t *thr;
3050 
3051  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3052  KMP_DEBUG_ASSERT(__kmp_init_serial);
3053 
3054  // validate level
3055  if (level == 0)
3056  return 0;
3057  if (level < 0)
3058  return -1;
3059  thr = __kmp_threads[gtid];
3060  team = thr->th.th_team;
3061  ii = team->t.t_level;
3062  if (level > ii)
3063  return -1;
3064 
3065  if (thr->th.th_teams_microtask) {
3066  // AC: we are in teams region where multiple nested teams have same level
3067  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3068  if (level <=
3069  tlevel) { // otherwise usual algorithm works (will not touch the teams)
3070  KMP_DEBUG_ASSERT(ii >= tlevel);
3071  // AC: As we need to pass by the teams league, we need to artificially
3072  // increase ii
3073  if (ii == tlevel) {
3074  ii += 2; // three teams have same level
3075  } else {
3076  ii++; // two teams have same level
3077  }
3078  }
3079  }
3080 
3081  if (ii == level)
3082  return __kmp_tid_from_gtid(gtid);
3083 
3084  dd = team->t.t_serialized;
3085  level++;
3086  while (ii > level) {
3087  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3088  }
3089  if ((team->t.t_serialized) && (!dd)) {
3090  team = team->t.t_parent;
3091  continue;
3092  }
3093  if (ii > level) {
3094  team = team->t.t_parent;
3095  dd = team->t.t_serialized;
3096  ii--;
3097  }
3098  }
3099 
3100  return (dd > 1) ? (0) : (team->t.t_master_tid);
3101 }
3102 
3103 int __kmp_get_team_size(int gtid, int level) {
3104 
3105  int ii, dd;
3106  kmp_team_t *team;
3107  kmp_info_t *thr;
3108 
3109  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3110  KMP_DEBUG_ASSERT(__kmp_init_serial);
3111 
3112  // validate level
3113  if (level == 0)
3114  return 1;
3115  if (level < 0)
3116  return -1;
3117  thr = __kmp_threads[gtid];
3118  team = thr->th.th_team;
3119  ii = team->t.t_level;
3120  if (level > ii)
3121  return -1;
3122 
3123  if (thr->th.th_teams_microtask) {
3124  // AC: we are in teams region where multiple nested teams have same level
3125  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3126  if (level <=
3127  tlevel) { // otherwise usual algorithm works (will not touch the teams)
3128  KMP_DEBUG_ASSERT(ii >= tlevel);
3129  // AC: As we need to pass by the teams league, we need to artificially
3130  // increase ii
3131  if (ii == tlevel) {
3132  ii += 2; // three teams have same level
3133  } else {
3134  ii++; // two teams have same level
3135  }
3136  }
3137  }
3138 
3139  while (ii > level) {
3140  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3141  }
3142  if (team->t.t_serialized && (!dd)) {
3143  team = team->t.t_parent;
3144  continue;
3145  }
3146  if (ii > level) {
3147  team = team->t.t_parent;
3148  ii--;
3149  }
3150  }
3151 
3152  return team->t.t_nproc;
3153 }
3154 
3155 kmp_r_sched_t __kmp_get_schedule_global() {
3156  // This routine created because pairs (__kmp_sched, __kmp_chunk) and
3157  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3158  // independently. So one can get the updated schedule here.
3159 
3160  kmp_r_sched_t r_sched;
3161 
3162  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3163  // __kmp_guided. __kmp_sched should keep original value, so that user can set
3164  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3165  // different roots (even in OMP 2.5)
3166  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3167  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3168  if (s == kmp_sch_static) {
3169  // replace STATIC with more detailed schedule (balanced or greedy)
3170  r_sched.r_sched_type = __kmp_static;
3171  } else if (s == kmp_sch_guided_chunked) {
3172  // replace GUIDED with more detailed schedule (iterative or analytical)
3173  r_sched.r_sched_type = __kmp_guided;
3174  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3175  r_sched.r_sched_type = __kmp_sched;
3176  }
3177  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3178 
3179  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3180  // __kmp_chunk may be wrong here (if it was not ever set)
3181  r_sched.chunk = KMP_DEFAULT_CHUNK;
3182  } else {
3183  r_sched.chunk = __kmp_chunk;
3184  }
3185 
3186  return r_sched;
3187 }
3188 
3189 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
3190  at least argc number of *t_argv entries for the requested team. */
3191 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3192 
3193  KMP_DEBUG_ASSERT(team);
3194  if (!realloc || argc > team->t.t_max_argc) {
3195 
3196  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3197  "current entries=%d\n",
3198  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3199  /* if previously allocated heap space for args, free them */
3200  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3201  __kmp_free((void *)team->t.t_argv);
3202 
3203  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3204  /* use unused space in the cache line for arguments */
3205  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3206  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3207  "argv entries\n",
3208  team->t.t_id, team->t.t_max_argc));
3209  team->t.t_argv = &team->t.t_inline_argv[0];
3210  if (__kmp_storage_map) {
3211  __kmp_print_storage_map_gtid(
3212  -1, &team->t.t_inline_argv[0],
3213  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3214  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3215  team->t.t_id);
3216  }
3217  } else {
3218  /* allocate space for arguments in the heap */
3219  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3220  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3221  : 2 * argc;
3222  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3223  "argv entries\n",
3224  team->t.t_id, team->t.t_max_argc));
3225  team->t.t_argv =
3226  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3227  if (__kmp_storage_map) {
3228  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3229  &team->t.t_argv[team->t.t_max_argc],
3230  sizeof(void *) * team->t.t_max_argc,
3231  "team_%d.t_argv", team->t.t_id);
3232  }
3233  }
3234  }
3235 }
3236 
3237 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3238  int i;
3239  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3240  team->t.t_threads =
3241  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3242  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3243  sizeof(dispatch_shared_info_t) * num_disp_buff);
3244  team->t.t_dispatch =
3245  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3246  team->t.t_implicit_task_taskdata =
3247  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3248  team->t.t_max_nproc = max_nth;
3249 
3250  /* setup dispatch buffers */
3251  for (i = 0; i < num_disp_buff; ++i) {
3252  team->t.t_disp_buffer[i].buffer_index = i;
3253  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3254  }
3255 }
3256 
3257 static void __kmp_free_team_arrays(kmp_team_t *team) {
3258  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3259  int i;
3260  for (i = 0; i < team->t.t_max_nproc; ++i) {
3261  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3262  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3263  team->t.t_dispatch[i].th_disp_buffer = NULL;
3264  }
3265  }
3266 #if KMP_USE_HIER_SCHED
3267  __kmp_dispatch_free_hierarchies(team);
3268 #endif
3269  __kmp_free(team->t.t_threads);
3270  __kmp_free(team->t.t_disp_buffer);
3271  __kmp_free(team->t.t_dispatch);
3272  __kmp_free(team->t.t_implicit_task_taskdata);
3273  team->t.t_threads = NULL;
3274  team->t.t_disp_buffer = NULL;
3275  team->t.t_dispatch = NULL;
3276  team->t.t_implicit_task_taskdata = 0;
3277 }
3278 
3279 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3280  kmp_info_t **oldThreads = team->t.t_threads;
3281 
3282  __kmp_free(team->t.t_disp_buffer);
3283  __kmp_free(team->t.t_dispatch);
3284  __kmp_free(team->t.t_implicit_task_taskdata);
3285  __kmp_allocate_team_arrays(team, max_nth);
3286 
3287  KMP_MEMCPY(team->t.t_threads, oldThreads,
3288  team->t.t_nproc * sizeof(kmp_info_t *));
3289 
3290  __kmp_free(oldThreads);
3291 }
3292 
3293 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3294 
3295  kmp_r_sched_t r_sched =
3296  __kmp_get_schedule_global(); // get current state of scheduling globals
3297 
3298  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3299 
3300  kmp_internal_control_t g_icvs = {
3301  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3302  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3303  // adjustment of threads (per thread)
3304  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3305  // whether blocktime is explicitly set
3306  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3307 #if KMP_USE_MONITOR
3308  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3309 // intervals
3310 #endif
3311  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3312  // next parallel region (per thread)
3313  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3314  __kmp_cg_max_nth, // int thread_limit;
3315  __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit
3316  // on task. This is used in the case of target thread_limit
3317  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3318  // for max_active_levels
3319  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3320  // {sched,chunk} pair
3321  __kmp_nested_proc_bind.bind_types[0],
3322  __kmp_default_device,
3323  NULL // struct kmp_internal_control *next;
3324  };
3325 
3326  return g_icvs;
3327 }
3328 
3329 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3330 
3331  kmp_internal_control_t gx_icvs;
3332  gx_icvs.serial_nesting_level =
3333  0; // probably =team->t.t_serial like in save_inter_controls
3334  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3335  gx_icvs.next = NULL;
3336 
3337  return gx_icvs;
3338 }
3339 
3340 static void __kmp_initialize_root(kmp_root_t *root) {
3341  int f;
3342  kmp_team_t *root_team;
3343  kmp_team_t *hot_team;
3344  int hot_team_max_nth;
3345  kmp_r_sched_t r_sched =
3346  __kmp_get_schedule_global(); // get current state of scheduling globals
3347  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3348  KMP_DEBUG_ASSERT(root);
3349  KMP_ASSERT(!root->r.r_begin);
3350 
3351  /* setup the root state structure */
3352  __kmp_init_lock(&root->r.r_begin_lock);
3353  root->r.r_begin = FALSE;
3354  root->r.r_active = FALSE;
3355  root->r.r_in_parallel = 0;
3356  root->r.r_blocktime = __kmp_dflt_blocktime;
3357 #if KMP_AFFINITY_SUPPORTED
3358  root->r.r_affinity_assigned = FALSE;
3359 #endif
3360 
3361  /* setup the root team for this task */
3362  /* allocate the root team structure */
3363  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3364 
3365  root_team =
3366  __kmp_allocate_team(root,
3367  1, // new_nproc
3368  1, // max_nproc
3369 #if OMPT_SUPPORT
3370  ompt_data_none, // root parallel id
3371 #endif
3372  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3373  0 // argc
3374  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3375  );
3376 #if USE_DEBUGGER
3377  // Non-NULL value should be assigned to make the debugger display the root
3378  // team.
3379  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3380 #endif
3381 
3382  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3383 
3384  root->r.r_root_team = root_team;
3385  root_team->t.t_control_stack_top = NULL;
3386 
3387  /* initialize root team */
3388  root_team->t.t_threads[0] = NULL;
3389  root_team->t.t_nproc = 1;
3390  root_team->t.t_serialized = 1;
3391  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3392  root_team->t.t_sched.sched = r_sched.sched;
3393  KA_TRACE(
3394  20,
3395  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3396  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3397 
3398  /* setup the hot team for this task */
3399  /* allocate the hot team structure */
3400  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3401 
3402  hot_team =
3403  __kmp_allocate_team(root,
3404  1, // new_nproc
3405  __kmp_dflt_team_nth_ub * 2, // max_nproc
3406 #if OMPT_SUPPORT
3407  ompt_data_none, // root parallel id
3408 #endif
3409  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3410  0 // argc
3411  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3412  );
3413  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3414 
3415  root->r.r_hot_team = hot_team;
3416  root_team->t.t_control_stack_top = NULL;
3417 
3418  /* first-time initialization */
3419  hot_team->t.t_parent = root_team;
3420 
3421  /* initialize hot team */
3422  hot_team_max_nth = hot_team->t.t_max_nproc;
3423  for (f = 0; f < hot_team_max_nth; ++f) {
3424  hot_team->t.t_threads[f] = NULL;
3425  }
3426  hot_team->t.t_nproc = 1;
3427  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3428  hot_team->t.t_sched.sched = r_sched.sched;
3429  hot_team->t.t_size_changed = 0;
3430 }
3431 
3432 #ifdef KMP_DEBUG
3433 
3434 typedef struct kmp_team_list_item {
3435  kmp_team_p const *entry;
3436  struct kmp_team_list_item *next;
3437 } kmp_team_list_item_t;
3438 typedef kmp_team_list_item_t *kmp_team_list_t;
3439 
3440 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3441  kmp_team_list_t list, // List of teams.
3442  kmp_team_p const *team // Team to add.
3443 ) {
3444 
3445  // List must terminate with item where both entry and next are NULL.
3446  // Team is added to the list only once.
3447  // List is sorted in ascending order by team id.
3448  // Team id is *not* a key.
3449 
3450  kmp_team_list_t l;
3451 
3452  KMP_DEBUG_ASSERT(list != NULL);
3453  if (team == NULL) {
3454  return;
3455  }
3456 
3457  __kmp_print_structure_team_accum(list, team->t.t_parent);
3458  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3459 
3460  // Search list for the team.
3461  l = list;
3462  while (l->next != NULL && l->entry != team) {
3463  l = l->next;
3464  }
3465  if (l->next != NULL) {
3466  return; // Team has been added before, exit.
3467  }
3468 
3469  // Team is not found. Search list again for insertion point.
3470  l = list;
3471  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3472  l = l->next;
3473  }
3474 
3475  // Insert team.
3476  {
3477  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3478  sizeof(kmp_team_list_item_t));
3479  *item = *l;
3480  l->entry = team;
3481  l->next = item;
3482  }
3483 }
3484 
3485 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3486 
3487 ) {
3488  __kmp_printf("%s", title);
3489  if (team != NULL) {
3490  __kmp_printf("%2x %p\n", team->t.t_id, team);
3491  } else {
3492  __kmp_printf(" - (nil)\n");
3493  }
3494 }
3495 
3496 static void __kmp_print_structure_thread(char const *title,
3497  kmp_info_p const *thread) {
3498  __kmp_printf("%s", title);
3499  if (thread != NULL) {
3500  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3501  } else {
3502  __kmp_printf(" - (nil)\n");
3503  }
3504 }
3505 
3506 void __kmp_print_structure(void) {
3507 
3508  kmp_team_list_t list;
3509 
3510  // Initialize list of teams.
3511  list =
3512  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3513  list->entry = NULL;
3514  list->next = NULL;
3515 
3516  __kmp_printf("\n------------------------------\nGlobal Thread "
3517  "Table\n------------------------------\n");
3518  {
3519  int gtid;
3520  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3521  __kmp_printf("%2d", gtid);
3522  if (__kmp_threads != NULL) {
3523  __kmp_printf(" %p", __kmp_threads[gtid]);
3524  }
3525  if (__kmp_root != NULL) {
3526  __kmp_printf(" %p", __kmp_root[gtid]);
3527  }
3528  __kmp_printf("\n");
3529  }
3530  }
3531 
3532  // Print out __kmp_threads array.
3533  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3534  "----------\n");
3535  if (__kmp_threads != NULL) {
3536  int gtid;
3537  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3538  kmp_info_t const *thread = __kmp_threads[gtid];
3539  if (thread != NULL) {
3540  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3541  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3542  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3543  __kmp_print_structure_team(" Serial Team: ",
3544  thread->th.th_serial_team);
3545  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3546  __kmp_print_structure_thread(" Primary: ",
3547  thread->th.th_team_master);
3548  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3549  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3550  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3551  __kmp_print_structure_thread(" Next in pool: ",
3552  thread->th.th_next_pool);
3553  __kmp_printf("\n");
3554  __kmp_print_structure_team_accum(list, thread->th.th_team);
3555  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3556  }
3557  }
3558  } else {
3559  __kmp_printf("Threads array is not allocated.\n");
3560  }
3561 
3562  // Print out __kmp_root array.
3563  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3564  "--------\n");
3565  if (__kmp_root != NULL) {
3566  int gtid;
3567  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3568  kmp_root_t const *root = __kmp_root[gtid];
3569  if (root != NULL) {
3570  __kmp_printf("GTID %2d %p:\n", gtid, root);
3571  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3572  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3573  __kmp_print_structure_thread(" Uber Thread: ",
3574  root->r.r_uber_thread);
3575  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3576  __kmp_printf(" In Parallel: %2d\n",
3577  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3578  __kmp_printf("\n");
3579  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3580  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3581  }
3582  }
3583  } else {
3584  __kmp_printf("Ubers array is not allocated.\n");
3585  }
3586 
3587  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3588  "--------\n");
3589  while (list->next != NULL) {
3590  kmp_team_p const *team = list->entry;
3591  int i;
3592  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3593  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3594  __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3595  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3596  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3597  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3598  for (i = 0; i < team->t.t_nproc; ++i) {
3599  __kmp_printf(" Thread %2d: ", i);
3600  __kmp_print_structure_thread("", team->t.t_threads[i]);
3601  }
3602  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3603  __kmp_printf("\n");
3604  list = list->next;
3605  }
3606 
3607  // Print out __kmp_thread_pool and __kmp_team_pool.
3608  __kmp_printf("\n------------------------------\nPools\n----------------------"
3609  "--------\n");
3610  __kmp_print_structure_thread("Thread pool: ",
3611  CCAST(kmp_info_t *, __kmp_thread_pool));
3612  __kmp_print_structure_team("Team pool: ",
3613  CCAST(kmp_team_t *, __kmp_team_pool));
3614  __kmp_printf("\n");
3615 
3616  // Free team list.
3617  while (list != NULL) {
3618  kmp_team_list_item_t *item = list;
3619  list = list->next;
3620  KMP_INTERNAL_FREE(item);
3621  }
3622 }
3623 
3624 #endif
3625 
3626 //---------------------------------------------------------------------------
3627 // Stuff for per-thread fast random number generator
3628 // Table of primes
3629 static const unsigned __kmp_primes[] = {
3630  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3631  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3632  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3633  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3634  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3635  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3636  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3637  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3638  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3639  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3640  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3641 
3642 //---------------------------------------------------------------------------
3643 // __kmp_get_random: Get a random number using a linear congruential method.
3644 unsigned short __kmp_get_random(kmp_info_t *thread) {
3645  unsigned x = thread->th.th_x;
3646  unsigned short r = (unsigned short)(x >> 16);
3647 
3648  thread->th.th_x = x * thread->th.th_a + 1;
3649 
3650  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3651  thread->th.th_info.ds.ds_tid, r));
3652 
3653  return r;
3654 }
3655 //--------------------------------------------------------
3656 // __kmp_init_random: Initialize a random number generator
3657 void __kmp_init_random(kmp_info_t *thread) {
3658  unsigned seed = thread->th.th_info.ds.ds_tid;
3659 
3660  thread->th.th_a =
3661  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3662  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3663  KA_TRACE(30,
3664  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3665 }
3666 
3667 #if KMP_OS_WINDOWS
3668 /* reclaim array entries for root threads that are already dead, returns number
3669  * reclaimed */
3670 static int __kmp_reclaim_dead_roots(void) {
3671  int i, r = 0;
3672 
3673  for (i = 0; i < __kmp_threads_capacity; ++i) {
3674  if (KMP_UBER_GTID(i) &&
3675  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3676  !__kmp_root[i]
3677  ->r.r_active) { // AC: reclaim only roots died in non-active state
3678  r += __kmp_unregister_root_other_thread(i);
3679  }
3680  }
3681  return r;
3682 }
3683 #endif
3684 
3685 /* This function attempts to create free entries in __kmp_threads and
3686  __kmp_root, and returns the number of free entries generated.
3687 
3688  For Windows* OS static library, the first mechanism used is to reclaim array
3689  entries for root threads that are already dead.
3690 
3691  On all platforms, expansion is attempted on the arrays __kmp_threads_ and
3692  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3693  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3694  threadprivate cache array has been created. Synchronization with
3695  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3696 
3697  After any dead root reclamation, if the clipping value allows array expansion
3698  to result in the generation of a total of nNeed free slots, the function does
3699  that expansion. If not, nothing is done beyond the possible initial root
3700  thread reclamation.
3701 
3702  If any argument is negative, the behavior is undefined. */
3703 static int __kmp_expand_threads(int nNeed) {
3704  int added = 0;
3705  int minimumRequiredCapacity;
3706  int newCapacity;
3707  kmp_info_t **newThreads;
3708  kmp_root_t **newRoot;
3709 
3710  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3711  // resizing __kmp_threads does not need additional protection if foreign
3712  // threads are present
3713 
3714 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3715  /* only for Windows static library */
3716  /* reclaim array entries for root threads that are already dead */
3717  added = __kmp_reclaim_dead_roots();
3718 
3719  if (nNeed) {
3720  nNeed -= added;
3721  if (nNeed < 0)
3722  nNeed = 0;
3723  }
3724 #endif
3725  if (nNeed <= 0)
3726  return added;
3727 
3728  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3729  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3730  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3731  // > __kmp_max_nth in one of two ways:
3732  //
3733  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3734  // may not be reused by another thread, so we may need to increase
3735  // __kmp_threads_capacity to __kmp_max_nth + 1.
3736  //
3737  // 2) New foreign root(s) are encountered. We always register new foreign
3738  // roots. This may cause a smaller # of threads to be allocated at
3739  // subsequent parallel regions, but the worker threads hang around (and
3740  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3741  //
3742  // Anyway, that is the reason for moving the check to see if
3743  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3744  // instead of having it performed here. -BB
3745 
3746  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3747 
3748  /* compute expansion headroom to check if we can expand */
3749  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3750  /* possible expansion too small -- give up */
3751  return added;
3752  }
3753  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3754 
3755  newCapacity = __kmp_threads_capacity;
3756  do {
3757  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3758  : __kmp_sys_max_nth;
3759  } while (newCapacity < minimumRequiredCapacity);
3760  newThreads = (kmp_info_t **)__kmp_allocate(
3761  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3762  newRoot =
3763  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3764  KMP_MEMCPY(newThreads, __kmp_threads,
3765  __kmp_threads_capacity * sizeof(kmp_info_t *));
3766  KMP_MEMCPY(newRoot, __kmp_root,
3767  __kmp_threads_capacity * sizeof(kmp_root_t *));
3768  // Put old __kmp_threads array on a list. Any ongoing references to the old
3769  // list will be valid. This list is cleaned up at library shutdown.
3770  kmp_old_threads_list_t *node =
3771  (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3772  node->threads = __kmp_threads;
3773  node->next = __kmp_old_threads_list;
3774  __kmp_old_threads_list = node;
3775 
3776  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3777  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3778  added += newCapacity - __kmp_threads_capacity;
3779  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3780 
3781  if (newCapacity > __kmp_tp_capacity) {
3782  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3783  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3784  __kmp_threadprivate_resize_cache(newCapacity);
3785  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3786  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3787  }
3788  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3789  }
3790 
3791  return added;
3792 }
3793 
3794 /* Register the current thread as a root thread and obtain our gtid. We must
3795  have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3796  thread that calls from __kmp_do_serial_initialize() */
3797 int __kmp_register_root(int initial_thread) {
3798  kmp_info_t *root_thread;
3799  kmp_root_t *root;
3800  int gtid;
3801  int capacity;
3802  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3803  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3804  KMP_MB();
3805 
3806  /* 2007-03-02:
3807  If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3808  initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3809  work as expected -- it may return false (that means there is at least one
3810  empty slot in __kmp_threads array), but it is possible the only free slot
3811  is #0, which is reserved for initial thread and so cannot be used for this
3812  one. Following code workarounds this bug.
3813 
3814  However, right solution seems to be not reserving slot #0 for initial
3815  thread because:
3816  (1) there is no magic in slot #0,
3817  (2) we cannot detect initial thread reliably (the first thread which does
3818  serial initialization may be not a real initial thread).
3819  */
3820  capacity = __kmp_threads_capacity;
3821  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3822  --capacity;
3823  }
3824 
3825  // If it is not for initializing the hidden helper team, we need to take
3826  // __kmp_hidden_helper_threads_num out of the capacity because it is included
3827  // in __kmp_threads_capacity.
3828  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3829  capacity -= __kmp_hidden_helper_threads_num;
3830  }
3831 
3832  /* see if there are too many threads */
3833  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3834  if (__kmp_tp_cached) {
3835  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3836  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3837  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3838  } else {
3839  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3840  __kmp_msg_null);
3841  }
3842  }
3843 
3844  // When hidden helper task is enabled, __kmp_threads is organized as follows:
3845  // 0: initial thread, also a regular OpenMP thread.
3846  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3847  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3848  // regular OpenMP threads.
3849  if (TCR_4(__kmp_init_hidden_helper_threads)) {
3850  // Find an available thread slot for hidden helper thread. Slots for hidden
3851  // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3852  for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3853  gtid <= __kmp_hidden_helper_threads_num;
3854  gtid++)
3855  ;
3856  KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3857  KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3858  "hidden helper thread: T#%d\n",
3859  gtid));
3860  } else {
3861  /* find an available thread slot */
3862  // Don't reassign the zero slot since we need that to only be used by
3863  // initial thread. Slots for hidden helper threads should also be skipped.
3864  if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3865  gtid = 0;
3866  } else {
3867  for (gtid = __kmp_hidden_helper_threads_num + 1;
3868  TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3869  ;
3870  }
3871  KA_TRACE(
3872  1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3873  KMP_ASSERT(gtid < __kmp_threads_capacity);
3874  }
3875 
3876  /* update global accounting */
3877  __kmp_all_nth++;
3878  TCW_4(__kmp_nth, __kmp_nth + 1);
3879 
3880  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3881  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3882  if (__kmp_adjust_gtid_mode) {
3883  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3884  if (TCR_4(__kmp_gtid_mode) != 2) {
3885  TCW_4(__kmp_gtid_mode, 2);
3886  }
3887  } else {
3888  if (TCR_4(__kmp_gtid_mode) != 1) {
3889  TCW_4(__kmp_gtid_mode, 1);
3890  }
3891  }
3892  }
3893 
3894 #ifdef KMP_ADJUST_BLOCKTIME
3895  /* Adjust blocktime to zero if necessary */
3896  /* Middle initialization might not have occurred yet */
3897  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3898  if (__kmp_nth > __kmp_avail_proc) {
3899  __kmp_zero_bt = TRUE;
3900  }
3901  }
3902 #endif /* KMP_ADJUST_BLOCKTIME */
3903 
3904  /* setup this new hierarchy */
3905  if (!(root = __kmp_root[gtid])) {
3906  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3907  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3908  }
3909 
3910 #if KMP_STATS_ENABLED
3911  // Initialize stats as soon as possible (right after gtid assignment).
3912  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3913  __kmp_stats_thread_ptr->startLife();
3914  KMP_SET_THREAD_STATE(SERIAL_REGION);
3915  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3916 #endif
3917  __kmp_initialize_root(root);
3918 
3919  /* setup new root thread structure */
3920  if (root->r.r_uber_thread) {
3921  root_thread = root->r.r_uber_thread;
3922  } else {
3923  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3924  if (__kmp_storage_map) {
3925  __kmp_print_thread_storage_map(root_thread, gtid);
3926  }
3927  root_thread->th.th_info.ds.ds_gtid = gtid;
3928 #if OMPT_SUPPORT
3929  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3930 #endif
3931  root_thread->th.th_root = root;
3932  if (__kmp_env_consistency_check) {
3933  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3934  }
3935 #if USE_FAST_MEMORY
3936  __kmp_initialize_fast_memory(root_thread);
3937 #endif /* USE_FAST_MEMORY */
3938 
3939 #if KMP_USE_BGET
3940  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3941  __kmp_initialize_bget(root_thread);
3942 #endif
3943  __kmp_init_random(root_thread); // Initialize random number generator
3944  }
3945 
3946  /* setup the serial team held in reserve by the root thread */
3947  if (!root_thread->th.th_serial_team) {
3948  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3949  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3950  root_thread->th.th_serial_team = __kmp_allocate_team(
3951  root, 1, 1,
3952 #if OMPT_SUPPORT
3953  ompt_data_none, // root parallel id
3954 #endif
3955  proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3956  }
3957  KMP_ASSERT(root_thread->th.th_serial_team);
3958  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3959  root_thread->th.th_serial_team));
3960 
3961  /* drop root_thread into place */
3962  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3963 
3964  root->r.r_root_team->t.t_threads[0] = root_thread;
3965  root->r.r_hot_team->t.t_threads[0] = root_thread;
3966  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3967  // AC: the team created in reserve, not for execution (it is unused for now).
3968  root_thread->th.th_serial_team->t.t_serialized = 0;
3969  root->r.r_uber_thread = root_thread;
3970 
3971  /* initialize the thread, get it ready to go */
3972  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3973  TCW_4(__kmp_init_gtid, TRUE);
3974 
3975  /* prepare the primary thread for get_gtid() */
3976  __kmp_gtid_set_specific(gtid);
3977 
3978 #if USE_ITT_BUILD
3979  __kmp_itt_thread_name(gtid);
3980 #endif /* USE_ITT_BUILD */
3981 
3982 #ifdef KMP_TDATA_GTID
3983  __kmp_gtid = gtid;
3984 #endif
3985  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3986  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3987 
3988  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3989  "plain=%u\n",
3990  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3991  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3992  KMP_INIT_BARRIER_STATE));
3993  { // Initialize barrier data.
3994  int b;
3995  for (b = 0; b < bs_last_barrier; ++b) {
3996  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3997 #if USE_DEBUGGER
3998  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3999 #endif
4000  }
4001  }
4002  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
4003  KMP_INIT_BARRIER_STATE);
4004 
4005 #if KMP_AFFINITY_SUPPORTED
4006  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
4007  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
4008  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
4009  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
4010 #endif /* KMP_AFFINITY_SUPPORTED */
4011  root_thread->th.th_def_allocator = __kmp_def_allocator;
4012  root_thread->th.th_prev_level = 0;
4013  root_thread->th.th_prev_num_threads = 1;
4014 
4015  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
4016  tmp->cg_root = root_thread;
4017  tmp->cg_thread_limit = __kmp_cg_max_nth;
4018  tmp->cg_nthreads = 1;
4019  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
4020  " cg_nthreads init to 1\n",
4021  root_thread, tmp));
4022  tmp->up = NULL;
4023  root_thread->th.th_cg_roots = tmp;
4024 
4025  __kmp_root_counter++;
4026 
4027 #if OMPT_SUPPORT
4028  if (!initial_thread && ompt_enabled.enabled) {
4029 
4030  kmp_info_t *root_thread = ompt_get_thread();
4031 
4032  ompt_set_thread_state(root_thread, ompt_state_overhead);
4033 
4034  if (ompt_enabled.ompt_callback_thread_begin) {
4035  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4036  ompt_thread_initial, __ompt_get_thread_data_internal());
4037  }
4038  ompt_data_t *task_data;
4039  ompt_data_t *parallel_data;
4040  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4041  NULL);
4042  if (ompt_enabled.ompt_callback_implicit_task) {
4043  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4044  ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4045  }
4046 
4047  ompt_set_thread_state(root_thread, ompt_state_work_serial);
4048  }
4049 #endif
4050 #if OMPD_SUPPORT
4051  if (ompd_state & OMPD_ENABLE_BP)
4052  ompd_bp_thread_begin();
4053 #endif
4054 
4055  KMP_MB();
4056  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4057 
4058  return gtid;
4059 }
4060 
4061 #if KMP_NESTED_HOT_TEAMS
4062 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
4063  const int max_level) {
4064  int i, n, nth;
4065  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
4066  if (!hot_teams || !hot_teams[level].hot_team) {
4067  return 0;
4068  }
4069  KMP_DEBUG_ASSERT(level < max_level);
4070  kmp_team_t *team = hot_teams[level].hot_team;
4071  nth = hot_teams[level].hot_team_nth;
4072  n = nth - 1; // primary thread is not freed
4073  if (level < max_level - 1) {
4074  for (i = 0; i < nth; ++i) {
4075  kmp_info_t *th = team->t.t_threads[i];
4076  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
4077  if (i > 0 && th->th.th_hot_teams) {
4078  __kmp_free(th->th.th_hot_teams);
4079  th->th.th_hot_teams = NULL;
4080  }
4081  }
4082  }
4083  __kmp_free_team(root, team, NULL);
4084  return n;
4085 }
4086 #endif
4087 
4088 // Resets a root thread and clear its root and hot teams.
4089 // Returns the number of __kmp_threads entries directly and indirectly freed.
4090 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4091  kmp_team_t *root_team = root->r.r_root_team;
4092  kmp_team_t *hot_team = root->r.r_hot_team;
4093  int n = hot_team->t.t_nproc;
4094  int i;
4095 
4096  KMP_DEBUG_ASSERT(!root->r.r_active);
4097 
4098  root->r.r_root_team = NULL;
4099  root->r.r_hot_team = NULL;
4100  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4101  // before call to __kmp_free_team().
4102  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4103 #if KMP_NESTED_HOT_TEAMS
4104  if (__kmp_hot_teams_max_level >
4105  0) { // need to free nested hot teams and their threads if any
4106  for (i = 0; i < hot_team->t.t_nproc; ++i) {
4107  kmp_info_t *th = hot_team->t.t_threads[i];
4108  if (__kmp_hot_teams_max_level > 1) {
4109  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4110  }
4111  if (th->th.th_hot_teams) {
4112  __kmp_free(th->th.th_hot_teams);
4113  th->th.th_hot_teams = NULL;
4114  }
4115  }
4116  }
4117 #endif
4118  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4119 
4120  // Before we can reap the thread, we need to make certain that all other
4121  // threads in the teams that had this root as ancestor have stopped trying to
4122  // steal tasks.
4123  if (__kmp_tasking_mode != tskm_immediate_exec) {
4124  __kmp_wait_to_unref_task_teams();
4125  }
4126 
4127 #if KMP_OS_WINDOWS
4128  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4129  KA_TRACE(
4130  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4131  "\n",
4132  (LPVOID) & (root->r.r_uber_thread->th),
4133  root->r.r_uber_thread->th.th_info.ds.ds_thread));
4134  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4135 #endif /* KMP_OS_WINDOWS */
4136 
4137 #if OMPD_SUPPORT
4138  if (ompd_state & OMPD_ENABLE_BP)
4139  ompd_bp_thread_end();
4140 #endif
4141 
4142 #if OMPT_SUPPORT
4143  ompt_data_t *task_data;
4144  ompt_data_t *parallel_data;
4145  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4146  NULL);
4147  if (ompt_enabled.ompt_callback_implicit_task) {
4148  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4149  ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4150  }
4151  if (ompt_enabled.ompt_callback_thread_end) {
4152  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4153  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4154  }
4155 #endif
4156 
4157  TCW_4(__kmp_nth,
4158  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4159  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4160  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4161  " to %d\n",
4162  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4163  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4164  if (i == 1) {
4165  // need to free contention group structure
4166  KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4167  root->r.r_uber_thread->th.th_cg_roots->cg_root);
4168  KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4169  __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4170  root->r.r_uber_thread->th.th_cg_roots = NULL;
4171  }
4172  __kmp_reap_thread(root->r.r_uber_thread, 1);
4173 
4174  // We canot put root thread to __kmp_thread_pool, so we have to reap it
4175  // instead of freeing.
4176  root->r.r_uber_thread = NULL;
4177  /* mark root as no longer in use */
4178  root->r.r_begin = FALSE;
4179 
4180  return n;
4181 }
4182 
4183 void __kmp_unregister_root_current_thread(int gtid) {
4184  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4185  /* this lock should be ok, since unregister_root_current_thread is never
4186  called during an abort, only during a normal close. furthermore, if you
4187  have the forkjoin lock, you should never try to get the initz lock */
4188  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4189  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4190  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4191  "exiting T#%d\n",
4192  gtid));
4193  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4194  return;
4195  }
4196  kmp_root_t *root = __kmp_root[gtid];
4197 
4198  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4199  KMP_ASSERT(KMP_UBER_GTID(gtid));
4200  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4201  KMP_ASSERT(root->r.r_active == FALSE);
4202 
4203  KMP_MB();
4204 
4205  kmp_info_t *thread = __kmp_threads[gtid];
4206  kmp_team_t *team = thread->th.th_team;
4207  kmp_task_team_t *task_team = thread->th.th_task_team;
4208 
4209  // we need to wait for the proxy tasks before finishing the thread
4210  if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4211  task_team->tt.tt_hidden_helper_task_encountered)) {
4212 #if OMPT_SUPPORT
4213  // the runtime is shutting down so we won't report any events
4214  thread->th.ompt_thread_info.state = ompt_state_undefined;
4215 #endif
4216  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4217  }
4218 
4219  __kmp_reset_root(gtid, root);
4220 
4221  KMP_MB();
4222  KC_TRACE(10,
4223  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4224 
4225  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4226 }
4227 
4228 #if KMP_OS_WINDOWS
4229 /* __kmp_forkjoin_lock must be already held
4230  Unregisters a root thread that is not the current thread. Returns the number
4231  of __kmp_threads entries freed as a result. */
4232 static int __kmp_unregister_root_other_thread(int gtid) {
4233  kmp_root_t *root = __kmp_root[gtid];
4234  int r;
4235 
4236  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4237  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4238  KMP_ASSERT(KMP_UBER_GTID(gtid));
4239  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4240  KMP_ASSERT(root->r.r_active == FALSE);
4241 
4242  r = __kmp_reset_root(gtid, root);
4243  KC_TRACE(10,
4244  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4245  return r;
4246 }
4247 #endif
4248 
4249 #if KMP_DEBUG
4250 void __kmp_task_info() {
4251 
4252  kmp_int32 gtid = __kmp_entry_gtid();
4253  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4254  kmp_info_t *this_thr = __kmp_threads[gtid];
4255  kmp_team_t *steam = this_thr->th.th_serial_team;
4256  kmp_team_t *team = this_thr->th.th_team;
4257 
4258  __kmp_printf(
4259  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4260  "ptask=%p\n",
4261  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4262  team->t.t_implicit_task_taskdata[tid].td_parent);
4263 }
4264 #endif // KMP_DEBUG
4265 
4266 /* TODO optimize with one big memclr, take out what isn't needed, split
4267  responsibility to workers as much as possible, and delay initialization of
4268  features as much as possible */
4269 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4270  int tid, int gtid) {
4271  /* this_thr->th.th_info.ds.ds_gtid is setup in
4272  kmp_allocate_thread/create_worker.
4273  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4274  KMP_DEBUG_ASSERT(this_thr != NULL);
4275  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4276  KMP_DEBUG_ASSERT(team);
4277  KMP_DEBUG_ASSERT(team->t.t_threads);
4278  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4279  kmp_info_t *master = team->t.t_threads[0];
4280  KMP_DEBUG_ASSERT(master);
4281  KMP_DEBUG_ASSERT(master->th.th_root);
4282 
4283  KMP_MB();
4284 
4285  TCW_SYNC_PTR(this_thr->th.th_team, team);
4286 
4287  this_thr->th.th_info.ds.ds_tid = tid;
4288  this_thr->th.th_set_nproc = 0;
4289  if (__kmp_tasking_mode != tskm_immediate_exec)
4290  // When tasking is possible, threads are not safe to reap until they are
4291  // done tasking; this will be set when tasking code is exited in wait
4292  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4293  else // no tasking --> always safe to reap
4294  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4295  this_thr->th.th_set_proc_bind = proc_bind_default;
4296 #if KMP_AFFINITY_SUPPORTED
4297  this_thr->th.th_new_place = this_thr->th.th_current_place;
4298 #endif
4299  this_thr->th.th_root = master->th.th_root;
4300 
4301  /* setup the thread's cache of the team structure */
4302  this_thr->th.th_team_nproc = team->t.t_nproc;
4303  this_thr->th.th_team_master = master;
4304  this_thr->th.th_team_serialized = team->t.t_serialized;
4305 
4306  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4307 
4308  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4309  tid, gtid, this_thr, this_thr->th.th_current_task));
4310 
4311  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4312  team, tid, TRUE);
4313 
4314  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4315  tid, gtid, this_thr, this_thr->th.th_current_task));
4316  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4317  // __kmp_initialize_team()?
4318 
4319  /* TODO no worksharing in speculative threads */
4320  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4321 
4322  this_thr->th.th_local.this_construct = 0;
4323 
4324  if (!this_thr->th.th_pri_common) {
4325  this_thr->th.th_pri_common =
4326  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4327  if (__kmp_storage_map) {
4328  __kmp_print_storage_map_gtid(
4329  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4330  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4331  }
4332  this_thr->th.th_pri_head = NULL;
4333  }
4334 
4335  if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4336  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4337  // Make new thread's CG root same as primary thread's
4338  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4339  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4340  if (tmp) {
4341  // worker changes CG, need to check if old CG should be freed
4342  int i = tmp->cg_nthreads--;
4343  KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4344  " on node %p of thread %p to %d\n",
4345  this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4346  if (i == 1) {
4347  __kmp_free(tmp); // last thread left CG --> free it
4348  }
4349  }
4350  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4351  // Increment new thread's CG root's counter to add the new thread
4352  this_thr->th.th_cg_roots->cg_nthreads++;
4353  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4354  " node %p of thread %p to %d\n",
4355  this_thr, this_thr->th.th_cg_roots,
4356  this_thr->th.th_cg_roots->cg_root,
4357  this_thr->th.th_cg_roots->cg_nthreads));
4358  this_thr->th.th_current_task->td_icvs.thread_limit =
4359  this_thr->th.th_cg_roots->cg_thread_limit;
4360  }
4361 
4362  /* Initialize dynamic dispatch */
4363  {
4364  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4365  // Use team max_nproc since this will never change for the team.
4366  size_t disp_size =
4367  sizeof(dispatch_private_info_t) *
4368  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4369  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4370  team->t.t_max_nproc));
4371  KMP_ASSERT(dispatch);
4372  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4373  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4374 
4375  dispatch->th_disp_index = 0;
4376  dispatch->th_doacross_buf_idx = 0;
4377  if (!dispatch->th_disp_buffer) {
4378  dispatch->th_disp_buffer =
4379  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4380 
4381  if (__kmp_storage_map) {
4382  __kmp_print_storage_map_gtid(
4383  gtid, &dispatch->th_disp_buffer[0],
4384  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4385  ? 1
4386  : __kmp_dispatch_num_buffers],
4387  disp_size,
4388  "th_%d.th_dispatch.th_disp_buffer "
4389  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4390  gtid, team->t.t_id, gtid);
4391  }
4392  } else {
4393  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4394  }
4395 
4396  dispatch->th_dispatch_pr_current = 0;
4397  dispatch->th_dispatch_sh_current = 0;
4398 
4399  dispatch->th_deo_fcn = 0; /* ORDERED */
4400  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4401  }
4402 
4403  this_thr->th.th_next_pool = NULL;
4404 
4405  if (!this_thr->th.th_task_state_memo_stack) {
4406  size_t i;
4407  this_thr->th.th_task_state_memo_stack =
4408  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4409  this_thr->th.th_task_state_top = 0;
4410  this_thr->th.th_task_state_stack_sz = 4;
4411  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4412  ++i) // zero init the stack
4413  this_thr->th.th_task_state_memo_stack[i] = 0;
4414  }
4415 
4416  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4417  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4418 
4419  KMP_MB();
4420 }
4421 
4422 /* allocate a new thread for the requesting team. this is only called from
4423  within a forkjoin critical section. we will first try to get an available
4424  thread from the thread pool. if none is available, we will fork a new one
4425  assuming we are able to create a new one. this should be assured, as the
4426  caller should check on this first. */
4427 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4428  int new_tid) {
4429  kmp_team_t *serial_team;
4430  kmp_info_t *new_thr;
4431  int new_gtid;
4432 
4433  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4434  KMP_DEBUG_ASSERT(root && team);
4435 #if !KMP_NESTED_HOT_TEAMS
4436  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4437 #endif
4438  KMP_MB();
4439 
4440  /* first, try to get one from the thread pool */
4441  if (__kmp_thread_pool) {
4442  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4443  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4444  if (new_thr == __kmp_thread_pool_insert_pt) {
4445  __kmp_thread_pool_insert_pt = NULL;
4446  }
4447  TCW_4(new_thr->th.th_in_pool, FALSE);
4448  __kmp_suspend_initialize_thread(new_thr);
4449  __kmp_lock_suspend_mx(new_thr);
4450  if (new_thr->th.th_active_in_pool == TRUE) {
4451  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4452  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4453  new_thr->th.th_active_in_pool = FALSE;
4454  }
4455  __kmp_unlock_suspend_mx(new_thr);
4456 
4457  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4458  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4459  KMP_ASSERT(!new_thr->th.th_team);
4460  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4461 
4462  /* setup the thread structure */
4463  __kmp_initialize_info(new_thr, team, new_tid,
4464  new_thr->th.th_info.ds.ds_gtid);
4465  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4466 
4467  TCW_4(__kmp_nth, __kmp_nth + 1);
4468 
4469  new_thr->th.th_task_state = 0;
4470  new_thr->th.th_task_state_top = 0;
4471  new_thr->th.th_task_state_stack_sz = 4;
4472 
4473  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4474  // Make sure pool thread has transitioned to waiting on own thread struct
4475  KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4476  // Thread activated in __kmp_allocate_team when increasing team size
4477  }
4478 
4479 #ifdef KMP_ADJUST_BLOCKTIME
4480  /* Adjust blocktime back to zero if necessary */
4481  /* Middle initialization might not have occurred yet */
4482  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4483  if (__kmp_nth > __kmp_avail_proc) {
4484  __kmp_zero_bt = TRUE;
4485  }
4486  }
4487 #endif /* KMP_ADJUST_BLOCKTIME */
4488 
4489 #if KMP_DEBUG
4490  // If thread entered pool via __kmp_free_thread, wait_flag should !=
4491  // KMP_BARRIER_PARENT_FLAG.
4492  int b;
4493  kmp_balign_t *balign = new_thr->th.th_bar;
4494  for (b = 0; b < bs_last_barrier; ++b)
4495  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4496 #endif
4497 
4498  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4499  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4500 
4501  KMP_MB();
4502  return new_thr;
4503  }
4504 
4505  /* no, well fork a new one */
4506  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4507  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4508 
4509 #if KMP_USE_MONITOR
4510  // If this is the first worker thread the RTL is creating, then also
4511  // launch the monitor thread. We try to do this as early as possible.
4512  if (!TCR_4(__kmp_init_monitor)) {
4513  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4514  if (!TCR_4(__kmp_init_monitor)) {
4515  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4516  TCW_4(__kmp_init_monitor, 1);
4517  __kmp_create_monitor(&__kmp_monitor);
4518  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4519 #if KMP_OS_WINDOWS
4520  // AC: wait until monitor has started. This is a fix for CQ232808.
4521  // The reason is that if the library is loaded/unloaded in a loop with
4522  // small (parallel) work in between, then there is high probability that
4523  // monitor thread started after the library shutdown. At shutdown it is
4524  // too late to cope with the problem, because when the primary thread is
4525  // in DllMain (process detach) the monitor has no chances to start (it is
4526  // blocked), and primary thread has no means to inform the monitor that
4527  // the library has gone, because all the memory which the monitor can
4528  // access is going to be released/reset.
4529  while (TCR_4(__kmp_init_monitor) < 2) {
4530  KMP_YIELD(TRUE);
4531  }
4532  KF_TRACE(10, ("after monitor thread has started\n"));
4533 #endif
4534  }
4535  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4536  }
4537 #endif
4538 
4539  KMP_MB();
4540 
4541  {
4542  int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4543  ? 1
4544  : __kmp_hidden_helper_threads_num + 1;
4545 
4546  for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4547  ++new_gtid) {
4548  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4549  }
4550 
4551  if (TCR_4(__kmp_init_hidden_helper_threads)) {
4552  KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4553  }
4554  }
4555 
4556  /* allocate space for it. */
4557  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4558 
4559  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4560 
4561 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4562  // suppress race conditions detection on synchronization flags in debug mode
4563  // this helps to analyze library internals eliminating false positives
4564  __itt_suppress_mark_range(
4565  __itt_suppress_range, __itt_suppress_threading_errors,
4566  &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4567  __itt_suppress_mark_range(
4568  __itt_suppress_range, __itt_suppress_threading_errors,
4569  &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4570 #if KMP_OS_WINDOWS
4571  __itt_suppress_mark_range(
4572  __itt_suppress_range, __itt_suppress_threading_errors,
4573  &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4574 #else
4575  __itt_suppress_mark_range(__itt_suppress_range,
4576  __itt_suppress_threading_errors,
4577  &new_thr->th.th_suspend_init_count,
4578  sizeof(new_thr->th.th_suspend_init_count));
4579 #endif
4580  // TODO: check if we need to also suppress b_arrived flags
4581  __itt_suppress_mark_range(__itt_suppress_range,
4582  __itt_suppress_threading_errors,
4583  CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4584  sizeof(new_thr->th.th_bar[0].bb.b_go));
4585  __itt_suppress_mark_range(__itt_suppress_range,
4586  __itt_suppress_threading_errors,
4587  CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4588  sizeof(new_thr->th.th_bar[1].bb.b_go));
4589  __itt_suppress_mark_range(__itt_suppress_range,
4590  __itt_suppress_threading_errors,
4591  CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4592  sizeof(new_thr->th.th_bar[2].bb.b_go));
4593 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4594  if (__kmp_storage_map) {
4595  __kmp_print_thread_storage_map(new_thr, new_gtid);
4596  }
4597 
4598  // add the reserve serialized team, initialized from the team's primary thread
4599  {
4600  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4601  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4602  new_thr->th.th_serial_team = serial_team =
4603  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4604 #if OMPT_SUPPORT
4605  ompt_data_none, // root parallel id
4606 #endif
4607  proc_bind_default, &r_icvs,
4608  0 USE_NESTED_HOT_ARG(NULL));
4609  }
4610  KMP_ASSERT(serial_team);
4611  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4612  // execution (it is unused for now).
4613  serial_team->t.t_threads[0] = new_thr;
4614  KF_TRACE(10,
4615  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4616  new_thr));
4617 
4618  /* setup the thread structures */
4619  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4620 
4621 #if USE_FAST_MEMORY
4622  __kmp_initialize_fast_memory(new_thr);
4623 #endif /* USE_FAST_MEMORY */
4624 
4625 #if KMP_USE_BGET
4626  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4627  __kmp_initialize_bget(new_thr);
4628 #endif
4629 
4630  __kmp_init_random(new_thr); // Initialize random number generator
4631 
4632  /* Initialize these only once when thread is grabbed for a team allocation */
4633  KA_TRACE(20,
4634  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4635  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4636 
4637  int b;
4638  kmp_balign_t *balign = new_thr->th.th_bar;
4639  for (b = 0; b < bs_last_barrier; ++b) {
4640  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4641  balign[b].bb.team = NULL;
4642  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4643  balign[b].bb.use_oncore_barrier = 0;
4644  }
4645 
4646  TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4647  new_thr->th.th_sleep_loc_type = flag_unset;
4648 
4649  new_thr->th.th_spin_here = FALSE;
4650  new_thr->th.th_next_waiting = 0;
4651 #if KMP_OS_UNIX
4652  new_thr->th.th_blocking = false;
4653 #endif
4654 
4655 #if KMP_AFFINITY_SUPPORTED
4656  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4657  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4658  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4659  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4660 #endif
4661  new_thr->th.th_def_allocator = __kmp_def_allocator;
4662  new_thr->th.th_prev_level = 0;
4663  new_thr->th.th_prev_num_threads = 1;
4664 
4665  TCW_4(new_thr->th.th_in_pool, FALSE);
4666  new_thr->th.th_active_in_pool = FALSE;
4667  TCW_4(new_thr->th.th_active, TRUE);
4668 
4669  /* adjust the global counters */
4670  __kmp_all_nth++;
4671  __kmp_nth++;
4672 
4673  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4674  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4675  if (__kmp_adjust_gtid_mode) {
4676  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4677  if (TCR_4(__kmp_gtid_mode) != 2) {
4678  TCW_4(__kmp_gtid_mode, 2);
4679  }
4680  } else {
4681  if (TCR_4(__kmp_gtid_mode) != 1) {
4682  TCW_4(__kmp_gtid_mode, 1);
4683  }
4684  }
4685  }
4686 
4687 #ifdef KMP_ADJUST_BLOCKTIME
4688  /* Adjust blocktime back to zero if necessary */
4689  /* Middle initialization might not have occurred yet */
4690  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4691  if (__kmp_nth > __kmp_avail_proc) {
4692  __kmp_zero_bt = TRUE;
4693  }
4694  }
4695 #endif /* KMP_ADJUST_BLOCKTIME */
4696 
4697 #if KMP_AFFINITY_SUPPORTED
4698  // Set the affinity and topology information for new thread
4699  __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE);
4700 #endif
4701 
4702  /* actually fork it and create the new worker thread */
4703  KF_TRACE(
4704  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4705  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4706  KF_TRACE(10,
4707  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4708 
4709  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4710  new_gtid));
4711  KMP_MB();
4712  return new_thr;
4713 }
4714 
4715 /* Reinitialize team for reuse.
4716  The hot team code calls this case at every fork barrier, so EPCC barrier
4717  test are extremely sensitive to changes in it, esp. writes to the team
4718  struct, which cause a cache invalidation in all threads.
4719  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4720 static void __kmp_reinitialize_team(kmp_team_t *team,
4721  kmp_internal_control_t *new_icvs,
4722  ident_t *loc) {
4723  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4724  team->t.t_threads[0], team));
4725  KMP_DEBUG_ASSERT(team && new_icvs);
4726  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4727  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4728 
4729  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4730  // Copy ICVs to the primary thread's implicit taskdata
4731  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4732  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4733 
4734  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4735  team->t.t_threads[0], team));
4736 }
4737 
4738 /* Initialize the team data structure.
4739  This assumes the t_threads and t_max_nproc are already set.
4740  Also, we don't touch the arguments */
4741 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4742  kmp_internal_control_t *new_icvs,
4743  ident_t *loc) {
4744  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4745 
4746  /* verify */
4747  KMP_DEBUG_ASSERT(team);
4748  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4749  KMP_DEBUG_ASSERT(team->t.t_threads);
4750  KMP_MB();
4751 
4752  team->t.t_master_tid = 0; /* not needed */
4753  /* team->t.t_master_bar; not needed */
4754  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4755  team->t.t_nproc = new_nproc;
4756 
4757  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4758  team->t.t_next_pool = NULL;
4759  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4760  * up hot team */
4761 
4762  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4763  team->t.t_invoke = NULL; /* not needed */
4764 
4765  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4766  team->t.t_sched.sched = new_icvs->sched.sched;
4767 
4768 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4769  team->t.t_fp_control_saved = FALSE; /* not needed */
4770  team->t.t_x87_fpu_control_word = 0; /* not needed */
4771  team->t.t_mxcsr = 0; /* not needed */
4772 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4773 
4774  team->t.t_construct = 0;
4775 
4776  team->t.t_ordered.dt.t_value = 0;
4777  team->t.t_master_active = FALSE;
4778 
4779 #ifdef KMP_DEBUG
4780  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4781 #endif
4782 #if KMP_OS_WINDOWS
4783  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4784 #endif
4785 
4786  team->t.t_control_stack_top = NULL;
4787 
4788  __kmp_reinitialize_team(team, new_icvs, loc);
4789 
4790  KMP_MB();
4791  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4792 }
4793 
4794 #if KMP_AFFINITY_SUPPORTED
4795 static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
4796  int first, int last, int newp) {
4797  th->th.th_first_place = first;
4798  th->th.th_last_place = last;
4799  th->th.th_new_place = newp;
4800  if (newp != th->th.th_current_place) {
4801  if (__kmp_display_affinity && team->t.t_display_affinity != 1)
4802  team->t.t_display_affinity = 1;
4803  // Copy topology information associated with the new place
4804  th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
4805  th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
4806  }
4807 }
4808 
4809 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4810 // It calculates the worker + primary thread's partition based upon the parent
4811 // thread's partition, and binds each worker to a thread in their partition.
4812 // The primary thread's partition should already include its current binding.
4813 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4814  // Do not partition places for the hidden helper team
4815  if (KMP_HIDDEN_HELPER_TEAM(team))
4816  return;
4817  // Copy the primary thread's place partition to the team struct
4818  kmp_info_t *master_th = team->t.t_threads[0];
4819  KMP_DEBUG_ASSERT(master_th != NULL);
4820  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4821  int first_place = master_th->th.th_first_place;
4822  int last_place = master_th->th.th_last_place;
4823  int masters_place = master_th->th.th_current_place;
4824  int num_masks = __kmp_affinity.num_masks;
4825  team->t.t_first_place = first_place;
4826  team->t.t_last_place = last_place;
4827 
4828  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4829  "bound to place %d partition = [%d,%d]\n",
4830  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4831  team->t.t_id, masters_place, first_place, last_place));
4832 
4833  switch (proc_bind) {
4834 
4835  case proc_bind_default:
4836  // Serial teams might have the proc_bind policy set to proc_bind_default.
4837  // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4838  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4839  break;
4840 
4841  case proc_bind_primary: {
4842  int f;
4843  int n_th = team->t.t_nproc;
4844  for (f = 1; f < n_th; f++) {
4845  kmp_info_t *th = team->t.t_threads[f];
4846  KMP_DEBUG_ASSERT(th != NULL);
4847  __kmp_set_thread_place(team, th, first_place, last_place, masters_place);
4848 
4849  KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4850  "partition = [%d,%d]\n",
4851  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4852  f, masters_place, first_place, last_place));
4853  }
4854  } break;
4855 
4856  case proc_bind_close: {
4857  int f;
4858  int n_th = team->t.t_nproc;
4859  int n_places;
4860  if (first_place <= last_place) {
4861  n_places = last_place - first_place + 1;
4862  } else {
4863  n_places = num_masks - first_place + last_place + 1;
4864  }
4865  if (n_th <= n_places) {
4866  int place = masters_place;
4867  for (f = 1; f < n_th; f++) {
4868  kmp_info_t *th = team->t.t_threads[f];
4869  KMP_DEBUG_ASSERT(th != NULL);
4870 
4871  if (place == last_place) {
4872  place = first_place;
4873  } else if (place == (num_masks - 1)) {
4874  place = 0;
4875  } else {
4876  place++;
4877  }
4878  __kmp_set_thread_place(team, th, first_place, last_place, place);
4879 
4880  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4881  "partition = [%d,%d]\n",
4882  __kmp_gtid_from_thread(team->t.t_threads[f]),
4883  team->t.t_id, f, place, first_place, last_place));
4884  }
4885  } else {
4886  int S, rem, gap, s_count;
4887  S = n_th / n_places;
4888  s_count = 0;
4889  rem = n_th - (S * n_places);
4890  gap = rem > 0 ? n_places / rem : n_places;
4891  int place = masters_place;
4892  int gap_ct = gap;
4893  for (f = 0; f < n_th; f++) {
4894  kmp_info_t *th = team->t.t_threads[f];
4895  KMP_DEBUG_ASSERT(th != NULL);
4896 
4897  __kmp_set_thread_place(team, th, first_place, last_place, place);
4898  s_count++;
4899 
4900  if ((s_count == S) && rem && (gap_ct == gap)) {
4901  // do nothing, add an extra thread to place on next iteration
4902  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4903  // we added an extra thread to this place; move to next place
4904  if (place == last_place) {
4905  place = first_place;
4906  } else if (place == (num_masks - 1)) {
4907  place = 0;
4908  } else {
4909  place++;
4910  }
4911  s_count = 0;
4912  gap_ct = 1;
4913  rem--;
4914  } else if (s_count == S) { // place full; don't add extra
4915  if (place == last_place) {
4916  place = first_place;
4917  } else if (place == (num_masks - 1)) {
4918  place = 0;
4919  } else {
4920  place++;
4921  }
4922  gap_ct++;
4923  s_count = 0;
4924  }
4925 
4926  KA_TRACE(100,
4927  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4928  "partition = [%d,%d]\n",
4929  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4930  th->th.th_new_place, first_place, last_place));
4931  }
4932  KMP_DEBUG_ASSERT(place == masters_place);
4933  }
4934  } break;
4935 
4936  case proc_bind_spread: {
4937  int f;
4938  int n_th = team->t.t_nproc;
4939  int n_places;
4940  int thidx;
4941  if (first_place <= last_place) {
4942  n_places = last_place - first_place + 1;
4943  } else {
4944  n_places = num_masks - first_place + last_place + 1;
4945  }
4946  if (n_th <= n_places) {
4947  int place = -1;
4948 
4949  if (n_places != num_masks) {
4950  int S = n_places / n_th;
4951  int s_count, rem, gap, gap_ct;
4952 
4953  place = masters_place;
4954  rem = n_places - n_th * S;
4955  gap = rem ? n_th / rem : 1;
4956  gap_ct = gap;
4957  thidx = n_th;
4958  if (update_master_only == 1)
4959  thidx = 1;
4960  for (f = 0; f < thidx; f++) {
4961  kmp_info_t *th = team->t.t_threads[f];
4962  KMP_DEBUG_ASSERT(th != NULL);
4963 
4964  int fplace = place, nplace = place;
4965  s_count = 1;
4966  while (s_count < S) {
4967  if (place == last_place) {
4968  place = first_place;
4969  } else if (place == (num_masks - 1)) {
4970  place = 0;
4971  } else {
4972  place++;
4973  }
4974  s_count++;
4975  }
4976  if (rem && (gap_ct == gap)) {
4977  if (place == last_place) {
4978  place = first_place;
4979  } else if (place == (num_masks - 1)) {
4980  place = 0;
4981  } else {
4982  place++;
4983  }
4984  rem--;
4985  gap_ct = 0;
4986  }
4987  __kmp_set_thread_place(team, th, fplace, place, nplace);
4988  gap_ct++;
4989 
4990  if (place == last_place) {
4991  place = first_place;
4992  } else if (place == (num_masks - 1)) {
4993  place = 0;
4994  } else {
4995  place++;
4996  }
4997 
4998  KA_TRACE(100,
4999  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5000  "partition = [%d,%d], num_masks: %u\n",
5001  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
5002  f, th->th.th_new_place, th->th.th_first_place,
5003  th->th.th_last_place, num_masks));
5004  }
5005  } else {
5006  /* Having uniform space of available computation places I can create
5007  T partitions of round(P/T) size and put threads into the first
5008  place of each partition. */
5009  double current = static_cast<double>(masters_place);
5010  double spacing =
5011  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
5012  int first, last;
5013  kmp_info_t *th;
5014 
5015  thidx = n_th + 1;
5016  if (update_master_only == 1)
5017  thidx = 1;
5018  for (f = 0; f < thidx; f++) {
5019  first = static_cast<int>(current);
5020  last = static_cast<int>(current + spacing) - 1;
5021  KMP_DEBUG_ASSERT(last >= first);
5022  if (first >= n_places) {
5023  if (masters_place) {
5024  first -= n_places;
5025  last -= n_places;
5026  if (first == (masters_place + 1)) {
5027  KMP_DEBUG_ASSERT(f == n_th);
5028  first--;
5029  }
5030  if (last == masters_place) {
5031  KMP_DEBUG_ASSERT(f == (n_th - 1));
5032  last--;
5033  }
5034  } else {
5035  KMP_DEBUG_ASSERT(f == n_th);
5036  first = 0;
5037  last = 0;
5038  }
5039  }
5040  if (last >= n_places) {
5041  last = (n_places - 1);
5042  }
5043  place = first;
5044  current += spacing;
5045  if (f < n_th) {
5046  KMP_DEBUG_ASSERT(0 <= first);
5047  KMP_DEBUG_ASSERT(n_places > first);
5048  KMP_DEBUG_ASSERT(0 <= last);
5049  KMP_DEBUG_ASSERT(n_places > last);
5050  KMP_DEBUG_ASSERT(last_place >= first_place);
5051  th = team->t.t_threads[f];
5052  KMP_DEBUG_ASSERT(th);
5053  __kmp_set_thread_place(team, th, first, last, place);
5054  KA_TRACE(100,
5055  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5056  "partition = [%d,%d], spacing = %.4f\n",
5057  __kmp_gtid_from_thread(team->t.t_threads[f]),
5058  team->t.t_id, f, th->th.th_new_place,
5059  th->th.th_first_place, th->th.th_last_place, spacing));
5060  }
5061  }
5062  }
5063  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5064  } else {
5065  int S, rem, gap, s_count;
5066  S = n_th / n_places;
5067  s_count = 0;
5068  rem = n_th - (S * n_places);
5069  gap = rem > 0 ? n_places / rem : n_places;
5070  int place = masters_place;
5071  int gap_ct = gap;
5072  thidx = n_th;
5073  if (update_master_only == 1)
5074  thidx = 1;
5075  for (f = 0; f < thidx; f++) {
5076  kmp_info_t *th = team->t.t_threads[f];
5077  KMP_DEBUG_ASSERT(th != NULL);
5078 
5079  __kmp_set_thread_place(team, th, place, place, place);
5080  s_count++;
5081 
5082  if ((s_count == S) && rem && (gap_ct == gap)) {
5083  // do nothing, add an extra thread to place on next iteration
5084  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5085  // we added an extra thread to this place; move on to next place
5086  if (place == last_place) {
5087  place = first_place;
5088  } else if (place == (num_masks - 1)) {
5089  place = 0;
5090  } else {
5091  place++;
5092  }
5093  s_count = 0;
5094  gap_ct = 1;
5095  rem--;
5096  } else if (s_count == S) { // place is full; don't add extra thread
5097  if (place == last_place) {
5098  place = first_place;
5099  } else if (place == (num_masks - 1)) {
5100  place = 0;
5101  } else {
5102  place++;
5103  }
5104  gap_ct++;
5105  s_count = 0;
5106  }
5107 
5108  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5109  "partition = [%d,%d]\n",
5110  __kmp_gtid_from_thread(team->t.t_threads[f]),
5111  team->t.t_id, f, th->th.th_new_place,
5112  th->th.th_first_place, th->th.th_last_place));
5113  }
5114  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5115  }
5116  } break;
5117 
5118  default:
5119  break;
5120  }
5121 
5122  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5123 }
5124 
5125 #endif // KMP_AFFINITY_SUPPORTED
5126 
5127 /* allocate a new team data structure to use. take one off of the free pool if
5128  available */
5129 kmp_team_t *
5130 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5131 #if OMPT_SUPPORT
5132  ompt_data_t ompt_parallel_data,
5133 #endif
5134  kmp_proc_bind_t new_proc_bind,
5135  kmp_internal_control_t *new_icvs,
5136  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5137  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5138  int f;
5139  kmp_team_t *team;
5140  int use_hot_team = !root->r.r_active;
5141  int level = 0;
5142  int do_place_partition = 1;
5143 
5144  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5145  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5146  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5147  KMP_MB();
5148 
5149 #if KMP_NESTED_HOT_TEAMS
5150  kmp_hot_team_ptr_t *hot_teams;
5151  if (master) {
5152  team = master->th.th_team;
5153  level = team->t.t_active_level;
5154  if (master->th.th_teams_microtask) { // in teams construct?
5155  if (master->th.th_teams_size.nteams > 1 &&
5156  ( // #teams > 1
5157  team->t.t_pkfn ==
5158  (microtask_t)__kmp_teams_master || // inner fork of the teams
5159  master->th.th_teams_level <
5160  team->t.t_level)) { // or nested parallel inside the teams
5161  ++level; // not increment if #teams==1, or for outer fork of the teams;
5162  // increment otherwise
5163  }
5164  // Do not perform the place partition if inner fork of the teams
5165  // Wait until nested parallel region encountered inside teams construct
5166  if ((master->th.th_teams_size.nteams == 1 &&
5167  master->th.th_teams_level >= team->t.t_level) ||
5168  (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5169  do_place_partition = 0;
5170  }
5171  hot_teams = master->th.th_hot_teams;
5172  if (level < __kmp_hot_teams_max_level && hot_teams &&
5173  hot_teams[level].hot_team) {
5174  // hot team has already been allocated for given level
5175  use_hot_team = 1;
5176  } else {
5177  use_hot_team = 0;
5178  }
5179  } else {
5180  // check we won't access uninitialized hot_teams, just in case
5181  KMP_DEBUG_ASSERT(new_nproc == 1);
5182  }
5183 #endif
5184  // Optimization to use a "hot" team
5185  if (use_hot_team && new_nproc > 1) {
5186  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5187 #if KMP_NESTED_HOT_TEAMS
5188  team = hot_teams[level].hot_team;
5189 #else
5190  team = root->r.r_hot_team;
5191 #endif
5192 #if KMP_DEBUG
5193  if (__kmp_tasking_mode != tskm_immediate_exec) {
5194  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5195  "task_team[1] = %p before reinit\n",
5196  team->t.t_task_team[0], team->t.t_task_team[1]));
5197  }
5198 #endif
5199 
5200  if (team->t.t_nproc != new_nproc &&
5201  __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5202  // Distributed barrier may need a resize
5203  int old_nthr = team->t.t_nproc;
5204  __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5205  }
5206 
5207  // If not doing the place partition, then reset the team's proc bind
5208  // to indicate that partitioning of all threads still needs to take place
5209  if (do_place_partition == 0)
5210  team->t.t_proc_bind = proc_bind_default;
5211  // Has the number of threads changed?
5212  /* Let's assume the most common case is that the number of threads is
5213  unchanged, and put that case first. */
5214  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5215  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5216  // This case can mean that omp_set_num_threads() was called and the hot
5217  // team size was already reduced, so we check the special flag
5218  if (team->t.t_size_changed == -1) {
5219  team->t.t_size_changed = 1;
5220  } else {
5221  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5222  }
5223 
5224  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5225  kmp_r_sched_t new_sched = new_icvs->sched;
5226  // set primary thread's schedule as new run-time schedule
5227  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5228 
5229  __kmp_reinitialize_team(team, new_icvs,
5230  root->r.r_uber_thread->th.th_ident);
5231 
5232  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5233  team->t.t_threads[0], team));
5234  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5235 
5236 #if KMP_AFFINITY_SUPPORTED
5237  if ((team->t.t_size_changed == 0) &&
5238  (team->t.t_proc_bind == new_proc_bind)) {
5239  if (new_proc_bind == proc_bind_spread) {
5240  if (do_place_partition) {
5241  // add flag to update only master for spread
5242  __kmp_partition_places(team, 1);
5243  }
5244  }
5245  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5246  "proc_bind = %d, partition = [%d,%d]\n",
5247  team->t.t_id, new_proc_bind, team->t.t_first_place,
5248  team->t.t_last_place));
5249  } else {
5250  if (do_place_partition) {
5251  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5252  __kmp_partition_places(team);
5253  }
5254  }
5255 #else
5256  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5257 #endif /* KMP_AFFINITY_SUPPORTED */
5258  } else if (team->t.t_nproc > new_nproc) {
5259  KA_TRACE(20,
5260  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5261  new_nproc));
5262 
5263  team->t.t_size_changed = 1;
5264  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5265  // Barrier size already reduced earlier in this function
5266  // Activate team threads via th_used_in_team
5267  __kmp_add_threads_to_team(team, new_nproc);
5268  }
5269 #if KMP_NESTED_HOT_TEAMS
5270  if (__kmp_hot_teams_mode == 0) {
5271  // AC: saved number of threads should correspond to team's value in this
5272  // mode, can be bigger in mode 1, when hot team has threads in reserve
5273  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5274  hot_teams[level].hot_team_nth = new_nproc;
5275 #endif // KMP_NESTED_HOT_TEAMS
5276  /* release the extra threads we don't need any more */
5277  for (f = new_nproc; f < team->t.t_nproc; f++) {
5278  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5279  if (__kmp_tasking_mode != tskm_immediate_exec) {
5280  // When decreasing team size, threads no longer in the team should
5281  // unref task team.
5282  team->t.t_threads[f]->th.th_task_team = NULL;
5283  }
5284  __kmp_free_thread(team->t.t_threads[f]);
5285  team->t.t_threads[f] = NULL;
5286  }
5287 #if KMP_NESTED_HOT_TEAMS
5288  } // (__kmp_hot_teams_mode == 0)
5289  else {
5290  // When keeping extra threads in team, switch threads to wait on own
5291  // b_go flag
5292  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5293  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5294  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5295  for (int b = 0; b < bs_last_barrier; ++b) {
5296  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5297  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5298  }
5299  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5300  }
5301  }
5302  }
5303 #endif // KMP_NESTED_HOT_TEAMS
5304  team->t.t_nproc = new_nproc;
5305  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5306  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5307  __kmp_reinitialize_team(team, new_icvs,
5308  root->r.r_uber_thread->th.th_ident);
5309 
5310  // Update remaining threads
5311  for (f = 0; f < new_nproc; ++f) {
5312  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5313  }
5314 
5315  // restore the current task state of the primary thread: should be the
5316  // implicit task
5317  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5318  team->t.t_threads[0], team));
5319 
5320  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5321 
5322 #ifdef KMP_DEBUG
5323  for (f = 0; f < team->t.t_nproc; f++) {
5324  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5325  team->t.t_threads[f]->th.th_team_nproc ==
5326  team->t.t_nproc);
5327  }
5328 #endif
5329 
5330  if (do_place_partition) {
5331  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5332 #if KMP_AFFINITY_SUPPORTED
5333  __kmp_partition_places(team);
5334 #endif
5335  }
5336  } else { // team->t.t_nproc < new_nproc
5337 
5338  KA_TRACE(20,
5339  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5340  new_nproc));
5341  int old_nproc = team->t.t_nproc; // save old value and use to update only
5342  team->t.t_size_changed = 1;
5343 
5344 #if KMP_NESTED_HOT_TEAMS
5345  int avail_threads = hot_teams[level].hot_team_nth;
5346  if (new_nproc < avail_threads)
5347  avail_threads = new_nproc;
5348  kmp_info_t **other_threads = team->t.t_threads;
5349  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5350  // Adjust barrier data of reserved threads (if any) of the team
5351  // Other data will be set in __kmp_initialize_info() below.
5352  int b;
5353  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5354  for (b = 0; b < bs_last_barrier; ++b) {
5355  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5356  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5357 #if USE_DEBUGGER
5358  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5359 #endif
5360  }
5361  }
5362  if (hot_teams[level].hot_team_nth >= new_nproc) {
5363  // we have all needed threads in reserve, no need to allocate any
5364  // this only possible in mode 1, cannot have reserved threads in mode 0
5365  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5366  team->t.t_nproc = new_nproc; // just get reserved threads involved
5367  } else {
5368  // We may have some threads in reserve, but not enough;
5369  // get reserved threads involved if any.
5370  team->t.t_nproc = hot_teams[level].hot_team_nth;
5371  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5372 #endif // KMP_NESTED_HOT_TEAMS
5373  if (team->t.t_max_nproc < new_nproc) {
5374  /* reallocate larger arrays */
5375  __kmp_reallocate_team_arrays(team, new_nproc);
5376  __kmp_reinitialize_team(team, new_icvs, NULL);
5377  }
5378 
5379 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5380  /* Temporarily set full mask for primary thread before creation of
5381  workers. The reason is that workers inherit the affinity from the
5382  primary thread, so if a lot of workers are created on the single
5383  core quickly, they don't get a chance to set their own affinity for
5384  a long time. */
5385  kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
5386 #endif
5387 
5388  /* allocate new threads for the hot team */
5389  for (f = team->t.t_nproc; f < new_nproc; f++) {
5390  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5391  KMP_DEBUG_ASSERT(new_worker);
5392  team->t.t_threads[f] = new_worker;
5393 
5394  KA_TRACE(20,
5395  ("__kmp_allocate_team: team %d init T#%d arrived: "
5396  "join=%llu, plain=%llu\n",
5397  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5398  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5399  team->t.t_bar[bs_plain_barrier].b_arrived));
5400 
5401  { // Initialize barrier data for new threads.
5402  int b;
5403  kmp_balign_t *balign = new_worker->th.th_bar;
5404  for (b = 0; b < bs_last_barrier; ++b) {
5405  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5406  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5407  KMP_BARRIER_PARENT_FLAG);
5408 #if USE_DEBUGGER
5409  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5410 #endif
5411  }
5412  }
5413  }
5414 
5415 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5416  /* Restore initial primary thread's affinity mask */
5417  new_temp_affinity.restore();
5418 #endif
5419 #if KMP_NESTED_HOT_TEAMS
5420  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5421 #endif // KMP_NESTED_HOT_TEAMS
5422  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5423  // Barrier size already increased earlier in this function
5424  // Activate team threads via th_used_in_team
5425  __kmp_add_threads_to_team(team, new_nproc);
5426  }
5427  /* make sure everyone is syncronized */
5428  // new threads below
5429  __kmp_initialize_team(team, new_nproc, new_icvs,
5430  root->r.r_uber_thread->th.th_ident);
5431 
5432  /* reinitialize the threads */
5433  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5434  for (f = 0; f < team->t.t_nproc; ++f)
5435  __kmp_initialize_info(team->t.t_threads[f], team, f,
5436  __kmp_gtid_from_tid(f, team));
5437 
5438  // set th_task_state for new threads in hot team with older thread's state
5439  kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5440  for (f = old_nproc; f < team->t.t_nproc; ++f)
5441  team->t.t_threads[f]->th.th_task_state = old_state;
5442 
5443 #ifdef KMP_DEBUG
5444  for (f = 0; f < team->t.t_nproc; ++f) {
5445  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5446  team->t.t_threads[f]->th.th_team_nproc ==
5447  team->t.t_nproc);
5448  }
5449 #endif
5450 
5451  if (do_place_partition) {
5452  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5453 #if KMP_AFFINITY_SUPPORTED
5454  __kmp_partition_places(team);
5455 #endif
5456  }
5457  } // Check changes in number of threads
5458 
5459  kmp_info_t *master = team->t.t_threads[0];
5460  if (master->th.th_teams_microtask) {
5461  for (f = 1; f < new_nproc; ++f) {
5462  // propagate teams construct specific info to workers
5463  kmp_info_t *thr = team->t.t_threads[f];
5464  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5465  thr->th.th_teams_level = master->th.th_teams_level;
5466  thr->th.th_teams_size = master->th.th_teams_size;
5467  }
5468  }
5469 #if KMP_NESTED_HOT_TEAMS
5470  if (level) {
5471  // Sync barrier state for nested hot teams, not needed for outermost hot
5472  // team.
5473  for (f = 1; f < new_nproc; ++f) {
5474  kmp_info_t *thr = team->t.t_threads[f];
5475  int b;
5476  kmp_balign_t *balign = thr->th.th_bar;
5477  for (b = 0; b < bs_last_barrier; ++b) {
5478  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5479  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5480 #if USE_DEBUGGER
5481  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5482 #endif
5483  }
5484  }
5485  }
5486 #endif // KMP_NESTED_HOT_TEAMS
5487 
5488  /* reallocate space for arguments if necessary */
5489  __kmp_alloc_argv_entries(argc, team, TRUE);
5490  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5491  // The hot team re-uses the previous task team,
5492  // if untouched during the previous release->gather phase.
5493 
5494  KF_TRACE(10, (" hot_team = %p\n", team));
5495 
5496 #if KMP_DEBUG
5497  if (__kmp_tasking_mode != tskm_immediate_exec) {
5498  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5499  "task_team[1] = %p after reinit\n",
5500  team->t.t_task_team[0], team->t.t_task_team[1]));
5501  }
5502 #endif
5503 
5504 #if OMPT_SUPPORT
5505  __ompt_team_assign_id(team, ompt_parallel_data);
5506 #endif
5507 
5508  KMP_MB();
5509 
5510  return team;
5511  }
5512 
5513  /* next, let's try to take one from the team pool */
5514  KMP_MB();
5515  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5516  /* TODO: consider resizing undersized teams instead of reaping them, now
5517  that we have a resizing mechanism */
5518  if (team->t.t_max_nproc >= max_nproc) {
5519  /* take this team from the team pool */
5520  __kmp_team_pool = team->t.t_next_pool;
5521 
5522  if (max_nproc > 1 &&
5523  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5524  if (!team->t.b) { // Allocate barrier structure
5525  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5526  }
5527  }
5528 
5529  /* setup the team for fresh use */
5530  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5531 
5532  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5533  "task_team[1] %p to NULL\n",
5534  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5535  team->t.t_task_team[0] = NULL;
5536  team->t.t_task_team[1] = NULL;
5537 
5538  /* reallocate space for arguments if necessary */
5539  __kmp_alloc_argv_entries(argc, team, TRUE);
5540  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5541 
5542  KA_TRACE(
5543  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5544  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5545  { // Initialize barrier data.
5546  int b;
5547  for (b = 0; b < bs_last_barrier; ++b) {
5548  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5549 #if USE_DEBUGGER
5550  team->t.t_bar[b].b_master_arrived = 0;
5551  team->t.t_bar[b].b_team_arrived = 0;
5552 #endif
5553  }
5554  }
5555 
5556  team->t.t_proc_bind = new_proc_bind;
5557 
5558  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5559  team->t.t_id));
5560 
5561 #if OMPT_SUPPORT
5562  __ompt_team_assign_id(team, ompt_parallel_data);
5563 #endif
5564 
5565  KMP_MB();
5566 
5567  return team;
5568  }
5569 
5570  /* reap team if it is too small, then loop back and check the next one */
5571  // not sure if this is wise, but, will be redone during the hot-teams
5572  // rewrite.
5573  /* TODO: Use technique to find the right size hot-team, don't reap them */
5574  team = __kmp_reap_team(team);
5575  __kmp_team_pool = team;
5576  }
5577 
5578  /* nothing available in the pool, no matter, make a new team! */
5579  KMP_MB();
5580  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5581 
5582  /* and set it up */
5583  team->t.t_max_nproc = max_nproc;
5584  if (max_nproc > 1 &&
5585  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5586  // Allocate barrier structure
5587  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5588  }
5589 
5590  /* NOTE well, for some reason allocating one big buffer and dividing it up
5591  seems to really hurt performance a lot on the P4, so, let's not use this */
5592  __kmp_allocate_team_arrays(team, max_nproc);
5593 
5594  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5595  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5596 
5597  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5598  "%p to NULL\n",
5599  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5600  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5601  // memory, no need to duplicate
5602  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5603  // memory, no need to duplicate
5604 
5605  if (__kmp_storage_map) {
5606  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5607  }
5608 
5609  /* allocate space for arguments */
5610  __kmp_alloc_argv_entries(argc, team, FALSE);
5611  team->t.t_argc = argc;
5612 
5613  KA_TRACE(20,
5614  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5615  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5616  { // Initialize barrier data.
5617  int b;
5618  for (b = 0; b < bs_last_barrier; ++b) {
5619  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5620 #if USE_DEBUGGER
5621  team->t.t_bar[b].b_master_arrived = 0;
5622  team->t.t_bar[b].b_team_arrived = 0;
5623 #endif
5624  }
5625  }
5626 
5627  team->t.t_proc_bind = new_proc_bind;
5628 
5629 #if OMPT_SUPPORT
5630  __ompt_team_assign_id(team, ompt_parallel_data);
5631  team->t.ompt_serialized_team_info = NULL;
5632 #endif
5633 
5634  KMP_MB();
5635 
5636  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5637  team->t.t_id));
5638 
5639  return team;
5640 }
5641 
5642 /* TODO implement hot-teams at all levels */
5643 /* TODO implement lazy thread release on demand (disband request) */
5644 
5645 /* free the team. return it to the team pool. release all the threads
5646  * associated with it */
5647 void __kmp_free_team(kmp_root_t *root,
5648  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5649  int f;
5650  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5651  team->t.t_id));
5652 
5653  /* verify state */
5654  KMP_DEBUG_ASSERT(root);
5655  KMP_DEBUG_ASSERT(team);
5656  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5657  KMP_DEBUG_ASSERT(team->t.t_threads);
5658 
5659  int use_hot_team = team == root->r.r_hot_team;
5660 #if KMP_NESTED_HOT_TEAMS
5661  int level;
5662  if (master) {
5663  level = team->t.t_active_level - 1;
5664  if (master->th.th_teams_microtask) { // in teams construct?
5665  if (master->th.th_teams_size.nteams > 1) {
5666  ++level; // level was not increased in teams construct for
5667  // team_of_masters
5668  }
5669  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5670  master->th.th_teams_level == team->t.t_level) {
5671  ++level; // level was not increased in teams construct for
5672  // team_of_workers before the parallel
5673  } // team->t.t_level will be increased inside parallel
5674  }
5675 #if KMP_DEBUG
5676  kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5677 #endif
5678  if (level < __kmp_hot_teams_max_level) {
5679  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5680  use_hot_team = 1;
5681  }
5682  }
5683 #endif // KMP_NESTED_HOT_TEAMS
5684 
5685  /* team is done working */
5686  TCW_SYNC_PTR(team->t.t_pkfn,
5687  NULL); // Important for Debugging Support Library.
5688 #if KMP_OS_WINDOWS
5689  team->t.t_copyin_counter = 0; // init counter for possible reuse
5690 #endif
5691  // Do not reset pointer to parent team to NULL for hot teams.
5692 
5693  /* if we are non-hot team, release our threads */
5694  if (!use_hot_team) {
5695  if (__kmp_tasking_mode != tskm_immediate_exec) {
5696  // Wait for threads to reach reapable state
5697  for (f = 1; f < team->t.t_nproc; ++f) {
5698  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5699  kmp_info_t *th = team->t.t_threads[f];
5700  volatile kmp_uint32 *state = &th->th.th_reap_state;
5701  while (*state != KMP_SAFE_TO_REAP) {
5702 #if KMP_OS_WINDOWS
5703  // On Windows a thread can be killed at any time, check this
5704  DWORD ecode;
5705  if (!__kmp_is_thread_alive(th, &ecode)) {
5706  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5707  break;
5708  }
5709 #endif
5710  // first check if thread is sleeping
5711  kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5712  if (fl.is_sleeping())
5713  fl.resume(__kmp_gtid_from_thread(th));
5714  KMP_CPU_PAUSE();
5715  }
5716  }
5717 
5718  // Delete task teams
5719  int tt_idx;
5720  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5721  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5722  if (task_team != NULL) {
5723  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5724  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5725  team->t.t_threads[f]->th.th_task_team = NULL;
5726  }
5727  KA_TRACE(
5728  20,
5729  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5730  __kmp_get_gtid(), task_team, team->t.t_id));
5731 #if KMP_NESTED_HOT_TEAMS
5732  __kmp_free_task_team(master, task_team);
5733 #endif
5734  team->t.t_task_team[tt_idx] = NULL;
5735  }
5736  }
5737  }
5738 
5739  // Reset pointer to parent team only for non-hot teams.
5740  team->t.t_parent = NULL;
5741  team->t.t_level = 0;
5742  team->t.t_active_level = 0;
5743 
5744  /* free the worker threads */
5745  for (f = 1; f < team->t.t_nproc; ++f) {
5746  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5747  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5748  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5749  1, 2);
5750  }
5751  __kmp_free_thread(team->t.t_threads[f]);
5752  }
5753 
5754  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5755  if (team->t.b) {
5756  // wake up thread at old location
5757  team->t.b->go_release();
5758  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5759  for (f = 1; f < team->t.t_nproc; ++f) {
5760  if (team->t.b->sleep[f].sleep) {
5761  __kmp_atomic_resume_64(
5762  team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5763  (kmp_atomic_flag_64<> *)NULL);
5764  }
5765  }
5766  }
5767  // Wait for threads to be removed from team
5768  for (int f = 1; f < team->t.t_nproc; ++f) {
5769  while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5770  KMP_CPU_PAUSE();
5771  }
5772  }
5773  }
5774 
5775  for (f = 1; f < team->t.t_nproc; ++f) {
5776  team->t.t_threads[f] = NULL;
5777  }
5778 
5779  if (team->t.t_max_nproc > 1 &&
5780  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5781  distributedBarrier::deallocate(team->t.b);
5782  team->t.b = NULL;
5783  }
5784  /* put the team back in the team pool */
5785  /* TODO limit size of team pool, call reap_team if pool too large */
5786  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5787  __kmp_team_pool = (volatile kmp_team_t *)team;
5788  } else { // Check if team was created for primary threads in teams construct
5789  // See if first worker is a CG root
5790  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5791  team->t.t_threads[1]->th.th_cg_roots);
5792  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5793  // Clean up the CG root nodes on workers so that this team can be re-used
5794  for (f = 1; f < team->t.t_nproc; ++f) {
5795  kmp_info_t *thr = team->t.t_threads[f];
5796  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5797  thr->th.th_cg_roots->cg_root == thr);
5798  // Pop current CG root off list
5799  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5800  thr->th.th_cg_roots = tmp->up;
5801  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5802  " up to node %p. cg_nthreads was %d\n",
5803  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5804  int i = tmp->cg_nthreads--;
5805  if (i == 1) {
5806  __kmp_free(tmp); // free CG if we are the last thread in it
5807  }
5808  // Restore current task's thread_limit from CG root
5809  if (thr->th.th_cg_roots)
5810  thr->th.th_current_task->td_icvs.thread_limit =
5811  thr->th.th_cg_roots->cg_thread_limit;
5812  }
5813  }
5814  }
5815 
5816  KMP_MB();
5817 }
5818 
5819 /* reap the team. destroy it, reclaim all its resources and free its memory */
5820 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5821  kmp_team_t *next_pool = team->t.t_next_pool;
5822 
5823  KMP_DEBUG_ASSERT(team);
5824  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5825  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5826  KMP_DEBUG_ASSERT(team->t.t_threads);
5827  KMP_DEBUG_ASSERT(team->t.t_argv);
5828 
5829  /* TODO clean the threads that are a part of this? */
5830 
5831  /* free stuff */
5832  __kmp_free_team_arrays(team);
5833  if (team->t.t_argv != &team->t.t_inline_argv[0])
5834  __kmp_free((void *)team->t.t_argv);
5835  __kmp_free(team);
5836 
5837  KMP_MB();
5838  return next_pool;
5839 }
5840 
5841 // Free the thread. Don't reap it, just place it on the pool of available
5842 // threads.
5843 //
5844 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5845 // binding for the affinity mechanism to be useful.
5846 //
5847 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5848 // However, we want to avoid a potential performance problem by always
5849 // scanning through the list to find the correct point at which to insert
5850 // the thread (potential N**2 behavior). To do this we keep track of the
5851 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5852 // With single-level parallelism, threads will always be added to the tail
5853 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5854 // parallelism, all bets are off and we may need to scan through the entire
5855 // free list.
5856 //
5857 // This change also has a potentially large performance benefit, for some
5858 // applications. Previously, as threads were freed from the hot team, they
5859 // would be placed back on the free list in inverse order. If the hot team
5860 // grew back to it's original size, then the freed thread would be placed
5861 // back on the hot team in reverse order. This could cause bad cache
5862 // locality problems on programs where the size of the hot team regularly
5863 // grew and shrunk.
5864 //
5865 // Now, for single-level parallelism, the OMP tid is always == gtid.
5866 void __kmp_free_thread(kmp_info_t *this_th) {
5867  int gtid;
5868  kmp_info_t **scan;
5869 
5870  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5871  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5872 
5873  KMP_DEBUG_ASSERT(this_th);
5874 
5875  // When moving thread to pool, switch thread to wait on own b_go flag, and
5876  // uninitialized (NULL team).
5877  int b;
5878  kmp_balign_t *balign = this_th->th.th_bar;
5879  for (b = 0; b < bs_last_barrier; ++b) {
5880  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5881  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5882  balign[b].bb.team = NULL;
5883  balign[b].bb.leaf_kids = 0;
5884  }
5885  this_th->th.th_task_state = 0;
5886  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5887 
5888  /* put thread back on the free pool */
5889  TCW_PTR(this_th->th.th_team, NULL);
5890  TCW_PTR(this_th->th.th_root, NULL);
5891  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5892 
5893  while (this_th->th.th_cg_roots) {
5894  this_th->th.th_cg_roots->cg_nthreads--;
5895  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5896  " %p of thread %p to %d\n",
5897  this_th, this_th->th.th_cg_roots,
5898  this_th->th.th_cg_roots->cg_root,
5899  this_th->th.th_cg_roots->cg_nthreads));
5900  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5901  if (tmp->cg_root == this_th) { // Thread is a cg_root
5902  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5903  KA_TRACE(
5904  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5905  this_th->th.th_cg_roots = tmp->up;
5906  __kmp_free(tmp);
5907  } else { // Worker thread
5908  if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5909  __kmp_free(tmp);
5910  }
5911  this_th->th.th_cg_roots = NULL;
5912  break;
5913  }
5914  }
5915 
5916  /* If the implicit task assigned to this thread can be used by other threads
5917  * -> multiple threads can share the data and try to free the task at
5918  * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5919  * with higher probability when hot team is disabled but can occurs even when
5920  * the hot team is enabled */
5921  __kmp_free_implicit_task(this_th);
5922  this_th->th.th_current_task = NULL;
5923 
5924  // If the __kmp_thread_pool_insert_pt is already past the new insert
5925  // point, then we need to re-scan the entire list.
5926  gtid = this_th->th.th_info.ds.ds_gtid;
5927  if (__kmp_thread_pool_insert_pt != NULL) {
5928  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5929  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5930  __kmp_thread_pool_insert_pt = NULL;
5931  }
5932  }
5933 
5934  // Scan down the list to find the place to insert the thread.
5935  // scan is the address of a link in the list, possibly the address of
5936  // __kmp_thread_pool itself.
5937  //
5938  // In the absence of nested parallelism, the for loop will have 0 iterations.
5939  if (__kmp_thread_pool_insert_pt != NULL) {
5940  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5941  } else {
5942  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5943  }
5944  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5945  scan = &((*scan)->th.th_next_pool))
5946  ;
5947 
5948  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5949  // to its address.
5950  TCW_PTR(this_th->th.th_next_pool, *scan);
5951  __kmp_thread_pool_insert_pt = *scan = this_th;
5952  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5953  (this_th->th.th_info.ds.ds_gtid <
5954  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5955  TCW_4(this_th->th.th_in_pool, TRUE);
5956  __kmp_suspend_initialize_thread(this_th);
5957  __kmp_lock_suspend_mx(this_th);
5958  if (this_th->th.th_active == TRUE) {
5959  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5960  this_th->th.th_active_in_pool = TRUE;
5961  }
5962 #if KMP_DEBUG
5963  else {
5964  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5965  }
5966 #endif
5967  __kmp_unlock_suspend_mx(this_th);
5968 
5969  TCW_4(__kmp_nth, __kmp_nth - 1);
5970 
5971 #ifdef KMP_ADJUST_BLOCKTIME
5972  /* Adjust blocktime back to user setting or default if necessary */
5973  /* Middle initialization might never have occurred */
5974  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5975  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5976  if (__kmp_nth <= __kmp_avail_proc) {
5977  __kmp_zero_bt = FALSE;
5978  }
5979  }
5980 #endif /* KMP_ADJUST_BLOCKTIME */
5981 
5982  KMP_MB();
5983 }
5984 
5985 /* ------------------------------------------------------------------------ */
5986 
5987 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5988 #if OMP_PROFILING_SUPPORT
5989  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5990  // TODO: add a configuration option for time granularity
5991  if (ProfileTraceFile)
5992  llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5993 #endif
5994 
5995  int gtid = this_thr->th.th_info.ds.ds_gtid;
5996  /* void *stack_data;*/
5997  kmp_team_t **volatile pteam;
5998 
5999  KMP_MB();
6000  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
6001 
6002  if (__kmp_env_consistency_check) {
6003  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
6004  }
6005 
6006 #if OMPD_SUPPORT
6007  if (ompd_state & OMPD_ENABLE_BP)
6008  ompd_bp_thread_begin();
6009 #endif
6010 
6011 #if OMPT_SUPPORT
6012  ompt_data_t *thread_data = nullptr;
6013  if (ompt_enabled.enabled) {
6014  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6015  *thread_data = ompt_data_none;
6016 
6017  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6018  this_thr->th.ompt_thread_info.wait_id = 0;
6019  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6020  this_thr->th.ompt_thread_info.parallel_flags = 0;
6021  if (ompt_enabled.ompt_callback_thread_begin) {
6022  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6023  ompt_thread_worker, thread_data);
6024  }
6025  this_thr->th.ompt_thread_info.state = ompt_state_idle;
6026  }
6027 #endif
6028 
6029  /* This is the place where threads wait for work */
6030  while (!TCR_4(__kmp_global.g.g_done)) {
6031  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6032  KMP_MB();
6033 
6034  /* wait for work to do */
6035  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6036 
6037  /* No tid yet since not part of a team */
6038  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6039 
6040 #if OMPT_SUPPORT
6041  if (ompt_enabled.enabled) {
6042  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6043  }
6044 #endif
6045 
6046  pteam = &this_thr->th.th_team;
6047 
6048  /* have we been allocated? */
6049  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6050  /* we were just woken up, so run our new task */
6051  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6052  int rc;
6053  KA_TRACE(20,
6054  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6055  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6056  (*pteam)->t.t_pkfn));
6057 
6058  updateHWFPControl(*pteam);
6059 
6060 #if OMPT_SUPPORT
6061  if (ompt_enabled.enabled) {
6062  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6063  }
6064 #endif
6065 
6066  rc = (*pteam)->t.t_invoke(gtid);
6067  KMP_ASSERT(rc);
6068 
6069  KMP_MB();
6070  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6071  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6072  (*pteam)->t.t_pkfn));
6073  }
6074 #if OMPT_SUPPORT
6075  if (ompt_enabled.enabled) {
6076  /* no frame set while outside task */
6077  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6078 
6079  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6080  }
6081 #endif
6082  /* join barrier after parallel region */
6083  __kmp_join_barrier(gtid);
6084  }
6085  }
6086 
6087 #if OMPD_SUPPORT
6088  if (ompd_state & OMPD_ENABLE_BP)
6089  ompd_bp_thread_end();
6090 #endif
6091 
6092 #if OMPT_SUPPORT
6093  if (ompt_enabled.ompt_callback_thread_end) {
6094  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6095  }
6096 #endif
6097 
6098  this_thr->th.th_task_team = NULL;
6099  /* run the destructors for the threadprivate data for this thread */
6100  __kmp_common_destroy_gtid(gtid);
6101 
6102  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6103  KMP_MB();
6104 
6105 #if OMP_PROFILING_SUPPORT
6106  llvm::timeTraceProfilerFinishThread();
6107 #endif
6108  return this_thr;
6109 }
6110 
6111 /* ------------------------------------------------------------------------ */
6112 
6113 void __kmp_internal_end_dest(void *specific_gtid) {
6114  // Make sure no significant bits are lost
6115  int gtid;
6116  __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6117 
6118  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6119  /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
6120  * this is because 0 is reserved for the nothing-stored case */
6121 
6122  __kmp_internal_end_thread(gtid);
6123 }
6124 
6125 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6126 
6127 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6128  __kmp_internal_end_atexit();
6129 }
6130 
6131 #endif
6132 
6133 /* [Windows] josh: when the atexit handler is called, there may still be more
6134  than one thread alive */
6135 void __kmp_internal_end_atexit(void) {
6136  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6137  /* [Windows]
6138  josh: ideally, we want to completely shutdown the library in this atexit
6139  handler, but stat code that depends on thread specific data for gtid fails
6140  because that data becomes unavailable at some point during the shutdown, so
6141  we call __kmp_internal_end_thread instead. We should eventually remove the
6142  dependency on __kmp_get_specific_gtid in the stat code and use
6143  __kmp_internal_end_library to cleanly shutdown the library.
6144 
6145  // TODO: Can some of this comment about GVS be removed?
6146  I suspect that the offending stat code is executed when the calling thread
6147  tries to clean up a dead root thread's data structures, resulting in GVS
6148  code trying to close the GVS structures for that thread, but since the stat
6149  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6150  the calling thread is cleaning up itself instead of another thread, it get
6151  confused. This happens because allowing a thread to unregister and cleanup
6152  another thread is a recent modification for addressing an issue.
6153  Based on the current design (20050722), a thread may end up
6154  trying to unregister another thread only if thread death does not trigger
6155  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6156  thread specific data destructor function to detect thread death. For
6157  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6158  is nothing. Thus, the workaround is applicable only for Windows static
6159  stat library. */
6160  __kmp_internal_end_library(-1);
6161 #if KMP_OS_WINDOWS
6162  __kmp_close_console();
6163 #endif
6164 }
6165 
6166 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6167  // It is assumed __kmp_forkjoin_lock is acquired.
6168 
6169  int gtid;
6170 
6171  KMP_DEBUG_ASSERT(thread != NULL);
6172 
6173  gtid = thread->th.th_info.ds.ds_gtid;
6174 
6175  if (!is_root) {
6176  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6177  /* Assume the threads are at the fork barrier here */
6178  KA_TRACE(
6179  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6180  gtid));
6181  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6182  while (
6183  !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6184  KMP_CPU_PAUSE();
6185  __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6186  } else {
6187  /* Need release fence here to prevent seg faults for tree forkjoin
6188  barrier (GEH) */
6189  kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6190  thread);
6191  __kmp_release_64(&flag);
6192  }
6193  }
6194 
6195  // Terminate OS thread.
6196  __kmp_reap_worker(thread);
6197 
6198  // The thread was killed asynchronously. If it was actively
6199  // spinning in the thread pool, decrement the global count.
6200  //
6201  // There is a small timing hole here - if the worker thread was just waking
6202  // up after sleeping in the pool, had reset it's th_active_in_pool flag but
6203  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6204  // the global counter might not get updated.
6205  //
6206  // Currently, this can only happen as the library is unloaded,
6207  // so there are no harmful side effects.
6208  if (thread->th.th_active_in_pool) {
6209  thread->th.th_active_in_pool = FALSE;
6210  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6211  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6212  }
6213  }
6214 
6215  __kmp_free_implicit_task(thread);
6216 
6217 // Free the fast memory for tasking
6218 #if USE_FAST_MEMORY
6219  __kmp_free_fast_memory(thread);
6220 #endif /* USE_FAST_MEMORY */
6221 
6222  __kmp_suspend_uninitialize_thread(thread);
6223 
6224  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6225  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6226 
6227  --__kmp_all_nth;
6228  // __kmp_nth was decremented when thread is added to the pool.
6229 
6230 #ifdef KMP_ADJUST_BLOCKTIME
6231  /* Adjust blocktime back to user setting or default if necessary */
6232  /* Middle initialization might never have occurred */
6233  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6234  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6235  if (__kmp_nth <= __kmp_avail_proc) {
6236  __kmp_zero_bt = FALSE;
6237  }
6238  }
6239 #endif /* KMP_ADJUST_BLOCKTIME */
6240 
6241  /* free the memory being used */
6242  if (__kmp_env_consistency_check) {
6243  if (thread->th.th_cons) {
6244  __kmp_free_cons_stack(thread->th.th_cons);
6245  thread->th.th_cons = NULL;
6246  }
6247  }
6248 
6249  if (thread->th.th_pri_common != NULL) {
6250  __kmp_free(thread->th.th_pri_common);
6251  thread->th.th_pri_common = NULL;
6252  }
6253 
6254  if (thread->th.th_task_state_memo_stack != NULL) {
6255  __kmp_free(thread->th.th_task_state_memo_stack);
6256  thread->th.th_task_state_memo_stack = NULL;
6257  }
6258 
6259 #if KMP_USE_BGET
6260  if (thread->th.th_local.bget_data != NULL) {
6261  __kmp_finalize_bget(thread);
6262  }
6263 #endif
6264 
6265 #if KMP_AFFINITY_SUPPORTED
6266  if (thread->th.th_affin_mask != NULL) {
6267  KMP_CPU_FREE(thread->th.th_affin_mask);
6268  thread->th.th_affin_mask = NULL;
6269  }
6270 #endif /* KMP_AFFINITY_SUPPORTED */
6271 
6272 #if KMP_USE_HIER_SCHED
6273  if (thread->th.th_hier_bar_data != NULL) {
6274  __kmp_free(thread->th.th_hier_bar_data);
6275  thread->th.th_hier_bar_data = NULL;
6276  }
6277 #endif
6278 
6279  __kmp_reap_team(thread->th.th_serial_team);
6280  thread->th.th_serial_team = NULL;
6281  __kmp_free(thread);
6282 
6283  KMP_MB();
6284 
6285 } // __kmp_reap_thread
6286 
6287 static void __kmp_itthash_clean(kmp_info_t *th) {
6288 #if USE_ITT_NOTIFY
6289  if (__kmp_itt_region_domains.count > 0) {
6290  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6291  kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6292  while (bucket) {
6293  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6294  __kmp_thread_free(th, bucket);
6295  bucket = next;
6296  }
6297  }
6298  }
6299  if (__kmp_itt_barrier_domains.count > 0) {
6300  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6301  kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6302  while (bucket) {
6303  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6304  __kmp_thread_free(th, bucket);
6305  bucket = next;
6306  }
6307  }
6308  }
6309 #endif
6310 }
6311 
6312 static void __kmp_internal_end(void) {
6313  int i;
6314 
6315  /* First, unregister the library */
6316  __kmp_unregister_library();
6317 
6318 #if KMP_OS_WINDOWS
6319  /* In Win static library, we can't tell when a root actually dies, so we
6320  reclaim the data structures for any root threads that have died but not
6321  unregistered themselves, in order to shut down cleanly.
6322  In Win dynamic library we also can't tell when a thread dies. */
6323  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6324 // dead roots
6325 #endif
6326 
6327  for (i = 0; i < __kmp_threads_capacity; i++)
6328  if (__kmp_root[i])
6329  if (__kmp_root[i]->r.r_active)
6330  break;
6331  KMP_MB(); /* Flush all pending memory write invalidates. */
6332  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6333 
6334  if (i < __kmp_threads_capacity) {
6335 #if KMP_USE_MONITOR
6336  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6337  KMP_MB(); /* Flush all pending memory write invalidates. */
6338 
6339  // Need to check that monitor was initialized before reaping it. If we are
6340  // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6341  // __kmp_monitor will appear to contain valid data, but it is only valid in
6342  // the parent process, not the child.
6343  // New behavior (201008): instead of keying off of the flag
6344  // __kmp_init_parallel, the monitor thread creation is keyed off
6345  // of the new flag __kmp_init_monitor.
6346  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6347  if (TCR_4(__kmp_init_monitor)) {
6348  __kmp_reap_monitor(&__kmp_monitor);
6349  TCW_4(__kmp_init_monitor, 0);
6350  }
6351  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6352  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6353 #endif // KMP_USE_MONITOR
6354  } else {
6355 /* TODO move this to cleanup code */
6356 #ifdef KMP_DEBUG
6357  /* make sure that everything has properly ended */
6358  for (i = 0; i < __kmp_threads_capacity; i++) {
6359  if (__kmp_root[i]) {
6360  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6361  // there can be uber threads alive here
6362  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6363  }
6364  }
6365 #endif
6366 
6367  KMP_MB();
6368 
6369  // Reap the worker threads.
6370  // This is valid for now, but be careful if threads are reaped sooner.
6371  while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
6372  // Get the next thread from the pool.
6373  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6374  __kmp_thread_pool = thread->th.th_next_pool;
6375  // Reap it.
6376  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6377  thread->th.th_next_pool = NULL;
6378  thread->th.th_in_pool = FALSE;
6379  __kmp_reap_thread(thread, 0);
6380  }
6381  __kmp_thread_pool_insert_pt = NULL;
6382 
6383  // Reap teams.
6384  while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6385  // Get the next team from the pool.
6386  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6387  __kmp_team_pool = team->t.t_next_pool;
6388  // Reap it.
6389  team->t.t_next_pool = NULL;
6390  __kmp_reap_team(team);
6391  }
6392 
6393  __kmp_reap_task_teams();
6394 
6395 #if KMP_OS_UNIX
6396  // Threads that are not reaped should not access any resources since they
6397  // are going to be deallocated soon, so the shutdown sequence should wait
6398  // until all threads either exit the final spin-waiting loop or begin
6399  // sleeping after the given blocktime.
6400  for (i = 0; i < __kmp_threads_capacity; i++) {
6401  kmp_info_t *thr = __kmp_threads[i];
6402  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6403  KMP_CPU_PAUSE();
6404  }
6405 #endif
6406 
6407  for (i = 0; i < __kmp_threads_capacity; ++i) {
6408  // TBD: Add some checking...
6409  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6410  }
6411 
6412  /* Make sure all threadprivate destructors get run by joining with all
6413  worker threads before resetting this flag */
6414  TCW_SYNC_4(__kmp_init_common, FALSE);
6415 
6416  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6417  KMP_MB();
6418 
6419 #if KMP_USE_MONITOR
6420  // See note above: One of the possible fixes for CQ138434 / CQ140126
6421  //
6422  // FIXME: push both code fragments down and CSE them?
6423  // push them into __kmp_cleanup() ?
6424  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6425  if (TCR_4(__kmp_init_monitor)) {
6426  __kmp_reap_monitor(&__kmp_monitor);
6427  TCW_4(__kmp_init_monitor, 0);
6428  }
6429  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6430  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6431 #endif
6432  } /* else !__kmp_global.t_active */
6433  TCW_4(__kmp_init_gtid, FALSE);
6434  KMP_MB(); /* Flush all pending memory write invalidates. */
6435 
6436  __kmp_cleanup();
6437 #if OMPT_SUPPORT
6438  ompt_fini();
6439 #endif
6440 }
6441 
6442 void __kmp_internal_end_library(int gtid_req) {
6443  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6444  /* this shouldn't be a race condition because __kmp_internal_end() is the
6445  only place to clear __kmp_serial_init */
6446  /* we'll check this later too, after we get the lock */
6447  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6448  // redundant, because the next check will work in any case.
6449  if (__kmp_global.g.g_abort) {
6450  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6451  /* TODO abort? */
6452  return;
6453  }
6454  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6455  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6456  return;
6457  }
6458 
6459  // If hidden helper team has been initialized, we need to deinit it
6460  if (TCR_4(__kmp_init_hidden_helper) &&
6461  !TCR_4(__kmp_hidden_helper_team_done)) {
6462  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6463  // First release the main thread to let it continue its work
6464  __kmp_hidden_helper_main_thread_release();
6465  // Wait until the hidden helper team has been destroyed
6466  __kmp_hidden_helper_threads_deinitz_wait();
6467  }
6468 
6469  KMP_MB(); /* Flush all pending memory write invalidates. */
6470  /* find out who we are and what we should do */
6471  {
6472  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6473  KA_TRACE(
6474  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6475  if (gtid == KMP_GTID_SHUTDOWN) {
6476  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6477  "already shutdown\n"));
6478  return;
6479  } else if (gtid == KMP_GTID_MONITOR) {
6480  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6481  "registered, or system shutdown\n"));
6482  return;
6483  } else if (gtid == KMP_GTID_DNE) {
6484  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6485  "shutdown\n"));
6486  /* we don't know who we are, but we may still shutdown the library */
6487  } else if (KMP_UBER_GTID(gtid)) {
6488  /* unregister ourselves as an uber thread. gtid is no longer valid */
6489  if (__kmp_root[gtid]->r.r_active) {
6490  __kmp_global.g.g_abort = -1;
6491  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6492  __kmp_unregister_library();
6493  KA_TRACE(10,
6494  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6495  gtid));
6496  return;
6497  } else {
6498  __kmp_itthash_clean(__kmp_threads[gtid]);
6499  KA_TRACE(
6500  10,
6501  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6502  __kmp_unregister_root_current_thread(gtid);
6503  }
6504  } else {
6505 /* worker threads may call this function through the atexit handler, if they
6506  * call exit() */
6507 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6508  TODO: do a thorough shutdown instead */
6509 #ifdef DUMP_DEBUG_ON_EXIT
6510  if (__kmp_debug_buf)
6511  __kmp_dump_debug_buffer();
6512 #endif
6513  // added unregister library call here when we switch to shm linux
6514  // if we don't, it will leave lots of files in /dev/shm
6515  // cleanup shared memory file before exiting.
6516  __kmp_unregister_library();
6517  return;
6518  }
6519  }
6520  /* synchronize the termination process */
6521  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6522 
6523  /* have we already finished */
6524  if (__kmp_global.g.g_abort) {
6525  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6526  /* TODO abort? */
6527  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6528  return;
6529  }
6530  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6531  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6532  return;
6533  }
6534 
6535  /* We need this lock to enforce mutex between this reading of
6536  __kmp_threads_capacity and the writing by __kmp_register_root.
6537  Alternatively, we can use a counter of roots that is atomically updated by
6538  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6539  __kmp_internal_end_*. */
6540  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6541 
6542  /* now we can safely conduct the actual termination */
6543  __kmp_internal_end();
6544 
6545  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6546  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6547 
6548  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6549 
6550 #ifdef DUMP_DEBUG_ON_EXIT
6551  if (__kmp_debug_buf)
6552  __kmp_dump_debug_buffer();
6553 #endif
6554 
6555 #if KMP_OS_WINDOWS
6556  __kmp_close_console();
6557 #endif
6558 
6559  __kmp_fini_allocator();
6560 
6561 } // __kmp_internal_end_library
6562 
6563 void __kmp_internal_end_thread(int gtid_req) {
6564  int i;
6565 
6566  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6567  /* this shouldn't be a race condition because __kmp_internal_end() is the
6568  * only place to clear __kmp_serial_init */
6569  /* we'll check this later too, after we get the lock */
6570  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6571  // redundant, because the next check will work in any case.
6572  if (__kmp_global.g.g_abort) {
6573  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6574  /* TODO abort? */
6575  return;
6576  }
6577  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6578  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6579  return;
6580  }
6581 
6582  // If hidden helper team has been initialized, we need to deinit it
6583  if (TCR_4(__kmp_init_hidden_helper) &&
6584  !TCR_4(__kmp_hidden_helper_team_done)) {
6585  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6586  // First release the main thread to let it continue its work
6587  __kmp_hidden_helper_main_thread_release();
6588  // Wait until the hidden helper team has been destroyed
6589  __kmp_hidden_helper_threads_deinitz_wait();
6590  }
6591 
6592  KMP_MB(); /* Flush all pending memory write invalidates. */
6593 
6594  /* find out who we are and what we should do */
6595  {
6596  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6597  KA_TRACE(10,
6598  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6599  if (gtid == KMP_GTID_SHUTDOWN) {
6600  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6601  "already shutdown\n"));
6602  return;
6603  } else if (gtid == KMP_GTID_MONITOR) {
6604  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6605  "registered, or system shutdown\n"));
6606  return;
6607  } else if (gtid == KMP_GTID_DNE) {
6608  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6609  "shutdown\n"));
6610  return;
6611  /* we don't know who we are */
6612  } else if (KMP_UBER_GTID(gtid)) {
6613  /* unregister ourselves as an uber thread. gtid is no longer valid */
6614  if (__kmp_root[gtid]->r.r_active) {
6615  __kmp_global.g.g_abort = -1;
6616  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6617  KA_TRACE(10,
6618  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6619  gtid));
6620  return;
6621  } else {
6622  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6623  gtid));
6624  __kmp_unregister_root_current_thread(gtid);
6625  }
6626  } else {
6627  /* just a worker thread, let's leave */
6628  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6629 
6630  if (gtid >= 0) {
6631  __kmp_threads[gtid]->th.th_task_team = NULL;
6632  }
6633 
6634  KA_TRACE(10,
6635  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6636  gtid));
6637  return;
6638  }
6639  }
6640 #if KMP_DYNAMIC_LIB
6641  if (__kmp_pause_status != kmp_hard_paused)
6642  // AC: lets not shutdown the dynamic library at the exit of uber thread,
6643  // because we will better shutdown later in the library destructor.
6644  {
6645  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6646  return;
6647  }
6648 #endif
6649  /* synchronize the termination process */
6650  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6651 
6652  /* have we already finished */
6653  if (__kmp_global.g.g_abort) {
6654  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6655  /* TODO abort? */
6656  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6657  return;
6658  }
6659  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6660  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6661  return;
6662  }
6663 
6664  /* We need this lock to enforce mutex between this reading of
6665  __kmp_threads_capacity and the writing by __kmp_register_root.
6666  Alternatively, we can use a counter of roots that is atomically updated by
6667  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6668  __kmp_internal_end_*. */
6669 
6670  /* should we finish the run-time? are all siblings done? */
6671  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6672 
6673  for (i = 0; i < __kmp_threads_capacity; ++i) {
6674  if (KMP_UBER_GTID(i)) {
6675  KA_TRACE(
6676  10,
6677  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6678  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6679  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6680  return;
6681  }
6682  }
6683 
6684  /* now we can safely conduct the actual termination */
6685 
6686  __kmp_internal_end();
6687 
6688  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6689  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6690 
6691  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6692 
6693 #ifdef DUMP_DEBUG_ON_EXIT
6694  if (__kmp_debug_buf)
6695  __kmp_dump_debug_buffer();
6696 #endif
6697 } // __kmp_internal_end_thread
6698 
6699 // -----------------------------------------------------------------------------
6700 // Library registration stuff.
6701 
6702 static long __kmp_registration_flag = 0;
6703 // Random value used to indicate library initialization.
6704 static char *__kmp_registration_str = NULL;
6705 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6706 
6707 static inline char *__kmp_reg_status_name() {
6708 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6709  each thread. If registration and unregistration go in different threads
6710  (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6711  env var can not be found, because the name will contain different pid. */
6712 // macOS* complains about name being too long with additional getuid()
6713 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6714  return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6715  (int)getuid());
6716 #else
6717  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6718 #endif
6719 } // __kmp_reg_status_get
6720 
6721 #if defined(KMP_USE_SHM)
6722 bool __kmp_shm_available = false;
6723 bool __kmp_tmp_available = false;
6724 // If /dev/shm is not accessible, we will create a temporary file under /tmp.
6725 char *temp_reg_status_file_name = nullptr;
6726 #endif
6727 
6728 void __kmp_register_library_startup(void) {
6729 
6730  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6731  int done = 0;
6732  union {
6733  double dtime;
6734  long ltime;
6735  } time;
6736 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6737  __kmp_initialize_system_tick();
6738 #endif
6739  __kmp_read_system_time(&time.dtime);
6740  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6741  __kmp_registration_str =
6742  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6743  __kmp_registration_flag, KMP_LIBRARY_FILE);
6744 
6745  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6746  __kmp_registration_str));
6747 
6748  while (!done) {
6749 
6750  char *value = NULL; // Actual value of the environment variable.
6751 
6752 #if defined(KMP_USE_SHM)
6753  char *shm_name = nullptr;
6754  char *data1 = nullptr;
6755  __kmp_shm_available = __kmp_detect_shm();
6756  if (__kmp_shm_available) {
6757  int fd1 = -1;
6758  shm_name = __kmp_str_format("/%s", name);
6759  int shm_preexist = 0;
6760  fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6761  if ((fd1 == -1) && (errno == EEXIST)) {
6762  // file didn't open because it already exists.
6763  // try opening existing file
6764  fd1 = shm_open(shm_name, O_RDWR, 0666);
6765  if (fd1 == -1) { // file didn't open
6766  KMP_WARNING(FunctionError, "Can't open SHM");
6767  __kmp_shm_available = false;
6768  } else { // able to open existing file
6769  shm_preexist = 1;
6770  }
6771  }
6772  if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size
6773  if (ftruncate(fd1, SHM_SIZE) == -1) { // error occured setting size;
6774  KMP_WARNING(FunctionError, "Can't set size of SHM");
6775  __kmp_shm_available = false;
6776  }
6777  }
6778  if (__kmp_shm_available) { // SHM exists, now map it
6779  data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6780  fd1, 0);
6781  if (data1 == MAP_FAILED) { // failed to map shared memory
6782  KMP_WARNING(FunctionError, "Can't map SHM");
6783  __kmp_shm_available = false;
6784  }
6785  }
6786  if (__kmp_shm_available) { // SHM mapped
6787  if (shm_preexist == 0) { // set data to SHM, set value
6788  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6789  }
6790  // Read value from either what we just wrote or existing file.
6791  value = __kmp_str_format("%s", data1); // read value from SHM
6792  munmap(data1, SHM_SIZE);
6793  }
6794  if (fd1 != -1)
6795  close(fd1);
6796  }
6797  if (!__kmp_shm_available)
6798  __kmp_tmp_available = __kmp_detect_tmp();
6799  if (!__kmp_shm_available && __kmp_tmp_available) {
6800  // SHM failed to work due to an error other than that the file already
6801  // exists. Try to create a temp file under /tmp.
6802  // If /tmp isn't accessible, fall back to using environment variable.
6803  // TODO: /tmp might not always be the temporary directory. For now we will
6804  // not consider TMPDIR.
6805  int fd1 = -1;
6806  temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
6807  int tmp_preexist = 0;
6808  fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6809  if ((fd1 == -1) && (errno == EEXIST)) {
6810  // file didn't open because it already exists.
6811  // try opening existing file
6812  fd1 = open(temp_reg_status_file_name, O_RDWR, 0666);
6813  if (fd1 == -1) { // file didn't open if (fd1 == -1) {
6814  KMP_WARNING(FunctionError, "Can't open TEMP");
6815  __kmp_tmp_available = false;
6816  } else {
6817  tmp_preexist = 1;
6818  }
6819  }
6820  if (__kmp_tmp_available && tmp_preexist == 0) {
6821  // we created /tmp file now set size
6822  if (ftruncate(fd1, SHM_SIZE) == -1) { // error occured setting size;
6823  KMP_WARNING(FunctionError, "Can't set size of /tmp file");
6824  __kmp_tmp_available = false;
6825  }
6826  }
6827  if (__kmp_tmp_available) {
6828  data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6829  fd1, 0);
6830  if (data1 == MAP_FAILED) { // failed to map /tmp
6831  KMP_WARNING(FunctionError, "Can't map /tmp");
6832  __kmp_tmp_available = false;
6833  }
6834  }
6835  if (__kmp_tmp_available) {
6836  if (tmp_preexist == 0) { // set data to TMP, set value
6837  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6838  }
6839  // Read value from either what we just wrote or existing file.
6840  value = __kmp_str_format("%s", data1); // read value from SHM
6841  munmap(data1, SHM_SIZE);
6842  }
6843  if (fd1 != -1)
6844  close(fd1);
6845  }
6846  if (!__kmp_shm_available && !__kmp_tmp_available) {
6847  // no /dev/shm and no /tmp -- fall back to environment variable
6848  // Set environment variable, but do not overwrite if it exists.
6849  __kmp_env_set(name, __kmp_registration_str, 0);
6850  // read value to see if it got set
6851  value = __kmp_env_get(name);
6852  }
6853 #else // Windows and unix with static library
6854  // Set environment variable, but do not overwrite if it exists.
6855  __kmp_env_set(name, __kmp_registration_str, 0);
6856  // read value to see if it got set
6857  value = __kmp_env_get(name);
6858 #endif
6859 
6860  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6861  done = 1; // Ok, environment variable set successfully, exit the loop.
6862  } else {
6863  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6864  // Check whether it alive or dead.
6865  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6866  char *tail = value;
6867  char *flag_addr_str = NULL;
6868  char *flag_val_str = NULL;
6869  char const *file_name = NULL;
6870  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6871  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6872  file_name = tail;
6873  if (tail != NULL) {
6874  unsigned long *flag_addr = 0;
6875  unsigned long flag_val = 0;
6876  KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6877  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6878  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6879  // First, check whether environment-encoded address is mapped into
6880  // addr space.
6881  // If so, dereference it to see if it still has the right value.
6882  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6883  neighbor = 1;
6884  } else {
6885  // If not, then we know the other copy of the library is no longer
6886  // running.
6887  neighbor = 2;
6888  }
6889  }
6890  }
6891  switch (neighbor) {
6892  case 0: // Cannot parse environment variable -- neighbor status unknown.
6893  // Assume it is the incompatible format of future version of the
6894  // library. Assume the other library is alive.
6895  // WARN( ... ); // TODO: Issue a warning.
6896  file_name = "unknown library";
6897  KMP_FALLTHROUGH();
6898  // Attention! Falling to the next case. That's intentional.
6899  case 1: { // Neighbor is alive.
6900  // Check it is allowed.
6901  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6902  if (!__kmp_str_match_true(duplicate_ok)) {
6903  // That's not allowed. Issue fatal error.
6904  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6905  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6906  }
6907  KMP_INTERNAL_FREE(duplicate_ok);
6908  __kmp_duplicate_library_ok = 1;
6909  done = 1; // Exit the loop.
6910  } break;
6911  case 2: { // Neighbor is dead.
6912 
6913 #if defined(KMP_USE_SHM)
6914  if (__kmp_shm_available) { // close shared memory.
6915  shm_unlink(shm_name); // this removes file in /dev/shm
6916  } else if (__kmp_tmp_available) {
6917  unlink(temp_reg_status_file_name); // this removes the temp file
6918  } else {
6919  // Clear the variable and try to register library again.
6920  __kmp_env_unset(name);
6921  }
6922 #else
6923  // Clear the variable and try to register library again.
6924  __kmp_env_unset(name);
6925 #endif
6926  } break;
6927  default: {
6928  KMP_DEBUG_ASSERT(0);
6929  } break;
6930  }
6931  }
6932  KMP_INTERNAL_FREE((void *)value);
6933 #if defined(KMP_USE_SHM)
6934  if (shm_name)
6935  KMP_INTERNAL_FREE((void *)shm_name);
6936 #endif
6937  } // while
6938  KMP_INTERNAL_FREE((void *)name);
6939 
6940 } // func __kmp_register_library_startup
6941 
6942 void __kmp_unregister_library(void) {
6943 
6944  char *name = __kmp_reg_status_name();
6945  char *value = NULL;
6946 
6947 #if defined(KMP_USE_SHM)
6948  char *shm_name = nullptr;
6949  int fd1;
6950  if (__kmp_shm_available) {
6951  shm_name = __kmp_str_format("/%s", name);
6952  fd1 = shm_open(shm_name, O_RDONLY, 0666);
6953  if (fd1 != -1) { // File opened successfully
6954  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6955  if (data1 != MAP_FAILED) {
6956  value = __kmp_str_format("%s", data1); // read value from SHM
6957  munmap(data1, SHM_SIZE);
6958  }
6959  close(fd1);
6960  }
6961  } else if (__kmp_tmp_available) { // try /tmp
6962  fd1 = open(temp_reg_status_file_name, O_RDONLY);
6963  if (fd1 != -1) { // File opened successfully
6964  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6965  if (data1 != MAP_FAILED) {
6966  value = __kmp_str_format("%s", data1); // read value from /tmp
6967  munmap(data1, SHM_SIZE);
6968  }
6969  close(fd1);
6970  }
6971  } else { // fall back to envirable
6972  value = __kmp_env_get(name);
6973  }
6974 #else
6975  value = __kmp_env_get(name);
6976 #endif
6977 
6978  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6979  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6980  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6981 // Ok, this is our variable. Delete it.
6982 #if defined(KMP_USE_SHM)
6983  if (__kmp_shm_available) {
6984  shm_unlink(shm_name); // this removes file in /dev/shm
6985  } else if (__kmp_tmp_available) {
6986  unlink(temp_reg_status_file_name); // this removes the temp file
6987  } else {
6988  __kmp_env_unset(name);
6989  }
6990 #else
6991  __kmp_env_unset(name);
6992 #endif
6993  }
6994 
6995 #if defined(KMP_USE_SHM)
6996  if (shm_name)
6997  KMP_INTERNAL_FREE(shm_name);
6998  if (temp_reg_status_file_name)
6999  KMP_INTERNAL_FREE(temp_reg_status_file_name);
7000 #endif
7001 
7002  KMP_INTERNAL_FREE(__kmp_registration_str);
7003  KMP_INTERNAL_FREE(value);
7004  KMP_INTERNAL_FREE(name);
7005 
7006  __kmp_registration_flag = 0;
7007  __kmp_registration_str = NULL;
7008 
7009 } // __kmp_unregister_library
7010 
7011 // End of Library registration stuff.
7012 // -----------------------------------------------------------------------------
7013 
7014 #if KMP_MIC_SUPPORTED
7015 
7016 static void __kmp_check_mic_type() {
7017  kmp_cpuid_t cpuid_state = {0};
7018  kmp_cpuid_t *cs_p = &cpuid_state;
7019  __kmp_x86_cpuid(1, 0, cs_p);
7020  // We don't support mic1 at the moment
7021  if ((cs_p->eax & 0xff0) == 0xB10) {
7022  __kmp_mic_type = mic2;
7023  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
7024  __kmp_mic_type = mic3;
7025  } else {
7026  __kmp_mic_type = non_mic;
7027  }
7028 }
7029 
7030 #endif /* KMP_MIC_SUPPORTED */
7031 
7032 #if KMP_HAVE_UMWAIT
7033 static void __kmp_user_level_mwait_init() {
7034  struct kmp_cpuid buf;
7035  __kmp_x86_cpuid(7, 0, &buf);
7036  __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
7037  __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
7038  __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
7039  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
7040  __kmp_umwait_enabled));
7041 }
7042 #elif KMP_HAVE_MWAIT
7043 #ifndef AT_INTELPHIUSERMWAIT
7044 // Spurious, non-existent value that should always fail to return anything.
7045 // Will be replaced with the correct value when we know that.
7046 #define AT_INTELPHIUSERMWAIT 10000
7047 #endif
7048 // getauxval() function is available in RHEL7 and SLES12. If a system with an
7049 // earlier OS is used to build the RTL, we'll use the following internal
7050 // function when the entry is not found.
7051 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
7052 unsigned long getauxval(unsigned long) { return 0; }
7053 
7054 static void __kmp_user_level_mwait_init() {
7055  // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
7056  // use them to find if the user-level mwait is enabled. Otherwise, forcibly
7057  // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
7058  // KMP_USER_LEVEL_MWAIT was set to TRUE.
7059  if (__kmp_mic_type == mic3) {
7060  unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7061  if ((res & 0x1) || __kmp_user_level_mwait) {
7062  __kmp_mwait_enabled = TRUE;
7063  if (__kmp_user_level_mwait) {
7064  KMP_INFORM(EnvMwaitWarn);
7065  }
7066  } else {
7067  __kmp_mwait_enabled = FALSE;
7068  }
7069  }
7070  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7071  "__kmp_mwait_enabled = %d\n",
7072  __kmp_mic_type, __kmp_mwait_enabled));
7073 }
7074 #endif /* KMP_HAVE_UMWAIT */
7075 
7076 static void __kmp_do_serial_initialize(void) {
7077  int i, gtid;
7078  size_t size;
7079 
7080  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7081 
7082  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7083  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7084  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7085  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7086  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7087 
7088 #if OMPT_SUPPORT
7089  ompt_pre_init();
7090 #endif
7091 #if OMPD_SUPPORT
7092  __kmp_env_dump();
7093  ompd_init();
7094 #endif
7095 
7096  __kmp_validate_locks();
7097 
7098 #if ENABLE_LIBOMPTARGET
7099  /* Initialize functions from libomptarget */
7100  __kmp_init_omptarget();
7101 #endif
7102 
7103  /* Initialize internal memory allocator */
7104  __kmp_init_allocator();
7105 
7106  /* Register the library startup via an environment variable or via mapped
7107  shared memory file and check to see whether another copy of the library is
7108  already registered. Since forked child process is often terminated, we
7109  postpone the registration till middle initialization in the child */
7110  if (__kmp_need_register_serial)
7111  __kmp_register_library_startup();
7112 
7113  /* TODO reinitialization of library */
7114  if (TCR_4(__kmp_global.g.g_done)) {
7115  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7116  }
7117 
7118  __kmp_global.g.g_abort = 0;
7119  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7120 
7121 /* initialize the locks */
7122 #if KMP_USE_ADAPTIVE_LOCKS
7123 #if KMP_DEBUG_ADAPTIVE_LOCKS
7124  __kmp_init_speculative_stats();
7125 #endif
7126 #endif
7127 #if KMP_STATS_ENABLED
7128  __kmp_stats_init();
7129 #endif
7130  __kmp_init_lock(&__kmp_global_lock);
7131  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
7132  __kmp_init_lock(&__kmp_debug_lock);
7133  __kmp_init_atomic_lock(&__kmp_atomic_lock);
7134  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7135  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7136  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7137  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7138  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7139  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7140  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7141  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7142  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7143  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7144  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7145  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7146  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7147  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7148 #if KMP_USE_MONITOR
7149  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7150 #endif
7151  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7152 
7153  /* conduct initialization and initial setup of configuration */
7154 
7155  __kmp_runtime_initialize();
7156 
7157 #if KMP_MIC_SUPPORTED
7158  __kmp_check_mic_type();
7159 #endif
7160 
7161 // Some global variable initialization moved here from kmp_env_initialize()
7162 #ifdef KMP_DEBUG
7163  kmp_diag = 0;
7164 #endif
7165  __kmp_abort_delay = 0;
7166 
7167  // From __kmp_init_dflt_team_nth()
7168  /* assume the entire machine will be used */
7169  __kmp_dflt_team_nth_ub = __kmp_xproc;
7170  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7171  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7172  }
7173  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7174  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7175  }
7176  __kmp_max_nth = __kmp_sys_max_nth;
7177  __kmp_cg_max_nth = __kmp_sys_max_nth;
7178  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7179  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7180  __kmp_teams_max_nth = __kmp_sys_max_nth;
7181  }
7182 
7183  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7184  // part
7185  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7186 #if KMP_USE_MONITOR
7187  __kmp_monitor_wakeups =
7188  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7189  __kmp_bt_intervals =
7190  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7191 #endif
7192  // From "KMP_LIBRARY" part of __kmp_env_initialize()
7193  __kmp_library = library_throughput;
7194  // From KMP_SCHEDULE initialization
7195  __kmp_static = kmp_sch_static_balanced;
7196 // AC: do not use analytical here, because it is non-monotonous
7197 //__kmp_guided = kmp_sch_guided_iterative_chunked;
7198 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7199 // need to repeat assignment
7200 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7201 // bit control and barrier method control parts
7202 #if KMP_FAST_REDUCTION_BARRIER
7203 #define kmp_reduction_barrier_gather_bb ((int)1)
7204 #define kmp_reduction_barrier_release_bb ((int)1)
7205 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7206 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7207 #endif // KMP_FAST_REDUCTION_BARRIER
7208  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7209  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7210  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7211  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7212  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7213 #if KMP_FAST_REDUCTION_BARRIER
7214  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7215  // lin_64 ): hyper,1
7216  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7217  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7218  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7219  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7220  }
7221 #endif // KMP_FAST_REDUCTION_BARRIER
7222  }
7223 #if KMP_FAST_REDUCTION_BARRIER
7224 #undef kmp_reduction_barrier_release_pat
7225 #undef kmp_reduction_barrier_gather_pat
7226 #undef kmp_reduction_barrier_release_bb
7227 #undef kmp_reduction_barrier_gather_bb
7228 #endif // KMP_FAST_REDUCTION_BARRIER
7229 #if KMP_MIC_SUPPORTED
7230  if (__kmp_mic_type == mic2) { // KNC
7231  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7232  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7233  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7234  1; // forkjoin release
7235  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7236  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7237  }
7238 #if KMP_FAST_REDUCTION_BARRIER
7239  if (__kmp_mic_type == mic2) { // KNC
7240  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7241  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7242  }
7243 #endif // KMP_FAST_REDUCTION_BARRIER
7244 #endif // KMP_MIC_SUPPORTED
7245 
7246 // From KMP_CHECKS initialization
7247 #ifdef KMP_DEBUG
7248  __kmp_env_checks = TRUE; /* development versions have the extra checks */
7249 #else
7250  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7251 #endif
7252 
7253  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7254  __kmp_foreign_tp = TRUE;
7255 
7256  __kmp_global.g.g_dynamic = FALSE;
7257  __kmp_global.g.g_dynamic_mode = dynamic_default;
7258 
7259  __kmp_init_nesting_mode();
7260 
7261  __kmp_env_initialize(NULL);
7262 
7263 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7264  __kmp_user_level_mwait_init();
7265 #endif
7266 // Print all messages in message catalog for testing purposes.
7267 #ifdef KMP_DEBUG
7268  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7269  if (__kmp_str_match_true(val)) {
7270  kmp_str_buf_t buffer;
7271  __kmp_str_buf_init(&buffer);
7272  __kmp_i18n_dump_catalog(&buffer);
7273  __kmp_printf("%s", buffer.str);
7274  __kmp_str_buf_free(&buffer);
7275  }
7276  __kmp_env_free(&val);
7277 #endif
7278 
7279  __kmp_threads_capacity =
7280  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7281  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7282  __kmp_tp_capacity = __kmp_default_tp_capacity(
7283  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7284 
7285  // If the library is shut down properly, both pools must be NULL. Just in
7286  // case, set them to NULL -- some memory may leak, but subsequent code will
7287  // work even if pools are not freed.
7288  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7289  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7290  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7291  __kmp_thread_pool = NULL;
7292  __kmp_thread_pool_insert_pt = NULL;
7293  __kmp_team_pool = NULL;
7294 
7295  /* Allocate all of the variable sized records */
7296  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7297  * expandable */
7298  /* Since allocation is cache-aligned, just add extra padding at the end */
7299  size =
7300  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7301  CACHE_LINE;
7302  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7303  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7304  sizeof(kmp_info_t *) * __kmp_threads_capacity);
7305 
7306  /* init thread counts */
7307  KMP_DEBUG_ASSERT(__kmp_all_nth ==
7308  0); // Asserts fail if the library is reinitializing and
7309  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7310  __kmp_all_nth = 0;
7311  __kmp_nth = 0;
7312 
7313  /* setup the uber master thread and hierarchy */
7314  gtid = __kmp_register_root(TRUE);
7315  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7316  KMP_ASSERT(KMP_UBER_GTID(gtid));
7317  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7318 
7319  KMP_MB(); /* Flush all pending memory write invalidates. */
7320 
7321  __kmp_common_initialize();
7322 
7323 #if KMP_OS_UNIX
7324  /* invoke the child fork handler */
7325  __kmp_register_atfork();
7326 #endif
7327 
7328 #if !KMP_DYNAMIC_LIB || \
7329  ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7330  {
7331  /* Invoke the exit handler when the program finishes, only for static
7332  library and macOS* dynamic. For other dynamic libraries, we already
7333  have _fini and DllMain. */
7334  int rc = atexit(__kmp_internal_end_atexit);
7335  if (rc != 0) {
7336  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7337  __kmp_msg_null);
7338  }
7339  }
7340 #endif
7341 
7342 #if KMP_HANDLE_SIGNALS
7343 #if KMP_OS_UNIX
7344  /* NOTE: make sure that this is called before the user installs their own
7345  signal handlers so that the user handlers are called first. this way they
7346  can return false, not call our handler, avoid terminating the library, and
7347  continue execution where they left off. */
7348  __kmp_install_signals(FALSE);
7349 #endif /* KMP_OS_UNIX */
7350 #if KMP_OS_WINDOWS
7351  __kmp_install_signals(TRUE);
7352 #endif /* KMP_OS_WINDOWS */
7353 #endif
7354 
7355  /* we have finished the serial initialization */
7356  __kmp_init_counter++;
7357 
7358  __kmp_init_serial = TRUE;
7359 
7360  if (__kmp_version) {
7361  __kmp_print_version_1();
7362  }
7363 
7364  if (__kmp_settings) {
7365  __kmp_env_print();
7366  }
7367 
7368  if (__kmp_display_env || __kmp_display_env_verbose) {
7369  __kmp_env_print_2();
7370  }
7371 
7372 #if OMPT_SUPPORT
7373  ompt_post_init();
7374 #endif
7375 
7376  KMP_MB();
7377 
7378  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7379 }
7380 
7381 void __kmp_serial_initialize(void) {
7382  if (__kmp_init_serial) {
7383  return;
7384  }
7385  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7386  if (__kmp_init_serial) {
7387  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7388  return;
7389  }
7390  __kmp_do_serial_initialize();
7391  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7392 }
7393 
7394 static void __kmp_do_middle_initialize(void) {
7395  int i, j;
7396  int prev_dflt_team_nth;
7397 
7398  if (!__kmp_init_serial) {
7399  __kmp_do_serial_initialize();
7400  }
7401 
7402  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7403 
7404  if (UNLIKELY(!__kmp_need_register_serial)) {
7405  // We are in a forked child process. The registration was skipped during
7406  // serial initialization in __kmp_atfork_child handler. Do it here.
7407  __kmp_register_library_startup();
7408  }
7409 
7410  // Save the previous value for the __kmp_dflt_team_nth so that
7411  // we can avoid some reinitialization if it hasn't changed.
7412  prev_dflt_team_nth = __kmp_dflt_team_nth;
7413 
7414 #if KMP_AFFINITY_SUPPORTED
7415  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7416  // number of cores on the machine.
7417  __kmp_affinity_initialize(__kmp_affinity);
7418 
7419 #endif /* KMP_AFFINITY_SUPPORTED */
7420 
7421  KMP_ASSERT(__kmp_xproc > 0);
7422  if (__kmp_avail_proc == 0) {
7423  __kmp_avail_proc = __kmp_xproc;
7424  }
7425 
7426  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7427  // correct them now
7428  j = 0;
7429  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7430  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7431  __kmp_avail_proc;
7432  j++;
7433  }
7434 
7435  if (__kmp_dflt_team_nth == 0) {
7436 #ifdef KMP_DFLT_NTH_CORES
7437  // Default #threads = #cores
7438  __kmp_dflt_team_nth = __kmp_ncores;
7439  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7440  "__kmp_ncores (%d)\n",
7441  __kmp_dflt_team_nth));
7442 #else
7443  // Default #threads = #available OS procs
7444  __kmp_dflt_team_nth = __kmp_avail_proc;
7445  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7446  "__kmp_avail_proc(%d)\n",
7447  __kmp_dflt_team_nth));
7448 #endif /* KMP_DFLT_NTH_CORES */
7449  }
7450 
7451  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7452  __kmp_dflt_team_nth = KMP_MIN_NTH;
7453  }
7454  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7455  __kmp_dflt_team_nth = __kmp_sys_max_nth;
7456  }
7457 
7458  if (__kmp_nesting_mode > 0)
7459  __kmp_set_nesting_mode_threads();
7460 
7461  // There's no harm in continuing if the following check fails,
7462  // but it indicates an error in the previous logic.
7463  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7464 
7465  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7466  // Run through the __kmp_threads array and set the num threads icv for each
7467  // root thread that is currently registered with the RTL (which has not
7468  // already explicitly set its nthreads-var with a call to
7469  // omp_set_num_threads()).
7470  for (i = 0; i < __kmp_threads_capacity; i++) {
7471  kmp_info_t *thread = __kmp_threads[i];
7472  if (thread == NULL)
7473  continue;
7474  if (thread->th.th_current_task->td_icvs.nproc != 0)
7475  continue;
7476 
7477  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7478  }
7479  }
7480  KA_TRACE(
7481  20,
7482  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7483  __kmp_dflt_team_nth));
7484 
7485 #ifdef KMP_ADJUST_BLOCKTIME
7486  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7487  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7488  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7489  if (__kmp_nth > __kmp_avail_proc) {
7490  __kmp_zero_bt = TRUE;
7491  }
7492  }
7493 #endif /* KMP_ADJUST_BLOCKTIME */
7494 
7495  /* we have finished middle initialization */
7496  TCW_SYNC_4(__kmp_init_middle, TRUE);
7497 
7498  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7499 }
7500 
7501 void __kmp_middle_initialize(void) {
7502  if (__kmp_init_middle) {
7503  return;
7504  }
7505  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7506  if (__kmp_init_middle) {
7507  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7508  return;
7509  }
7510  __kmp_do_middle_initialize();
7511  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7512 }
7513 
7514 void __kmp_parallel_initialize(void) {
7515  int gtid = __kmp_entry_gtid(); // this might be a new root
7516 
7517  /* synchronize parallel initialization (for sibling) */
7518  if (TCR_4(__kmp_init_parallel))
7519  return;
7520  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7521  if (TCR_4(__kmp_init_parallel)) {
7522  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7523  return;
7524  }
7525 
7526  /* TODO reinitialization after we have already shut down */
7527  if (TCR_4(__kmp_global.g.g_done)) {
7528  KA_TRACE(
7529  10,
7530  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7531  __kmp_infinite_loop();
7532  }
7533 
7534  /* jc: The lock __kmp_initz_lock is already held, so calling
7535  __kmp_serial_initialize would cause a deadlock. So we call
7536  __kmp_do_serial_initialize directly. */
7537  if (!__kmp_init_middle) {
7538  __kmp_do_middle_initialize();
7539  }
7540  __kmp_assign_root_init_mask();
7541  __kmp_resume_if_hard_paused();
7542 
7543  /* begin initialization */
7544  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7545  KMP_ASSERT(KMP_UBER_GTID(gtid));
7546 
7547 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7548  // Save the FP control regs.
7549  // Worker threads will set theirs to these values at thread startup.
7550  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7551  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7552  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7553 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7554 
7555 #if KMP_OS_UNIX
7556 #if KMP_HANDLE_SIGNALS
7557  /* must be after __kmp_serial_initialize */
7558  __kmp_install_signals(TRUE);
7559 #endif
7560 #endif
7561 
7562  __kmp_suspend_initialize();
7563 
7564 #if defined(USE_LOAD_BALANCE)
7565  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7566  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7567  }
7568 #else
7569  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7570  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7571  }
7572 #endif
7573 
7574  if (__kmp_version) {
7575  __kmp_print_version_2();
7576  }
7577 
7578  /* we have finished parallel initialization */
7579  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7580 
7581  KMP_MB();
7582  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7583 
7584  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7585 }
7586 
7587 void __kmp_hidden_helper_initialize() {
7588  if (TCR_4(__kmp_init_hidden_helper))
7589  return;
7590 
7591  // __kmp_parallel_initialize is required before we initialize hidden helper
7592  if (!TCR_4(__kmp_init_parallel))
7593  __kmp_parallel_initialize();
7594 
7595  // Double check. Note that this double check should not be placed before
7596  // __kmp_parallel_initialize as it will cause dead lock.
7597  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7598  if (TCR_4(__kmp_init_hidden_helper)) {
7599  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7600  return;
7601  }
7602 
7603 #if KMP_AFFINITY_SUPPORTED
7604  // Initialize hidden helper affinity settings.
7605  // The above __kmp_parallel_initialize() will initialize
7606  // regular affinity (and topology) if not already done.
7607  if (!__kmp_hh_affinity.flags.initialized)
7608  __kmp_affinity_initialize(__kmp_hh_affinity);
7609 #endif
7610 
7611  // Set the count of hidden helper tasks to be executed to zero
7612  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7613 
7614  // Set the global variable indicating that we're initializing hidden helper
7615  // team/threads
7616  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7617 
7618  // Platform independent initialization
7619  __kmp_do_initialize_hidden_helper_threads();
7620 
7621  // Wait here for the finish of initialization of hidden helper teams
7622  __kmp_hidden_helper_threads_initz_wait();
7623 
7624  // We have finished hidden helper initialization
7625  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7626 
7627  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7628 }
7629 
7630 /* ------------------------------------------------------------------------ */
7631 
7632 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7633  kmp_team_t *team) {
7634  kmp_disp_t *dispatch;
7635 
7636  KMP_MB();
7637 
7638  /* none of the threads have encountered any constructs, yet. */
7639  this_thr->th.th_local.this_construct = 0;
7640 #if KMP_CACHE_MANAGE
7641  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7642 #endif /* KMP_CACHE_MANAGE */
7643  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7644  KMP_DEBUG_ASSERT(dispatch);
7645  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7646  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7647  // this_thr->th.th_info.ds.ds_tid ] );
7648 
7649  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7650  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7651  if (__kmp_env_consistency_check)
7652  __kmp_push_parallel(gtid, team->t.t_ident);
7653 
7654  KMP_MB(); /* Flush all pending memory write invalidates. */
7655 }
7656 
7657 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7658  kmp_team_t *team) {
7659  if (__kmp_env_consistency_check)
7660  __kmp_pop_parallel(gtid, team->t.t_ident);
7661 
7662  __kmp_finish_implicit_task(this_thr);
7663 }
7664 
7665 int __kmp_invoke_task_func(int gtid) {
7666  int rc;
7667  int tid = __kmp_tid_from_gtid(gtid);
7668  kmp_info_t *this_thr = __kmp_threads[gtid];
7669  kmp_team_t *team = this_thr->th.th_team;
7670 
7671  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7672 #if USE_ITT_BUILD
7673  if (__itt_stack_caller_create_ptr) {
7674  // inform ittnotify about entering user's code
7675  if (team->t.t_stack_id != NULL) {
7676  __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7677  } else {
7678  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7679  __kmp_itt_stack_callee_enter(
7680  (__itt_caller)team->t.t_parent->t.t_stack_id);
7681  }
7682  }
7683 #endif /* USE_ITT_BUILD */
7684 #if INCLUDE_SSC_MARKS
7685  SSC_MARK_INVOKING();
7686 #endif
7687 
7688 #if OMPT_SUPPORT
7689  void *dummy;
7690  void **exit_frame_p;
7691  ompt_data_t *my_task_data;
7692  ompt_data_t *my_parallel_data;
7693  int ompt_team_size;
7694 
7695  if (ompt_enabled.enabled) {
7696  exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7697  .ompt_task_info.frame.exit_frame.ptr);
7698  } else {
7699  exit_frame_p = &dummy;
7700  }
7701 
7702  my_task_data =
7703  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7704  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7705  if (ompt_enabled.ompt_callback_implicit_task) {
7706  ompt_team_size = team->t.t_nproc;
7707  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7708  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7709  __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7710  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7711  }
7712 #endif
7713 
7714 #if KMP_STATS_ENABLED
7715  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7716  if (previous_state == stats_state_e::TEAMS_REGION) {
7717  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7718  } else {
7719  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7720  }
7721  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7722 #endif
7723 
7724  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7725  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7726 #if OMPT_SUPPORT
7727  ,
7728  exit_frame_p
7729 #endif
7730  );
7731 #if OMPT_SUPPORT
7732  *exit_frame_p = NULL;
7733  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7734 #endif
7735 
7736 #if KMP_STATS_ENABLED
7737  if (previous_state == stats_state_e::TEAMS_REGION) {
7738  KMP_SET_THREAD_STATE(previous_state);
7739  }
7740  KMP_POP_PARTITIONED_TIMER();
7741 #endif
7742 
7743 #if USE_ITT_BUILD
7744  if (__itt_stack_caller_create_ptr) {
7745  // inform ittnotify about leaving user's code
7746  if (team->t.t_stack_id != NULL) {
7747  __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7748  } else {
7749  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7750  __kmp_itt_stack_callee_leave(
7751  (__itt_caller)team->t.t_parent->t.t_stack_id);
7752  }
7753  }
7754 #endif /* USE_ITT_BUILD */
7755  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7756 
7757  return rc;
7758 }
7759 
7760 void __kmp_teams_master(int gtid) {
7761  // This routine is called by all primary threads in teams construct
7762  kmp_info_t *thr = __kmp_threads[gtid];
7763  kmp_team_t *team = thr->th.th_team;
7764  ident_t *loc = team->t.t_ident;
7765  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7766  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7767  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7768  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7769  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7770 
7771  // This thread is a new CG root. Set up the proper variables.
7772  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7773  tmp->cg_root = thr; // Make thr the CG root
7774  // Init to thread limit stored when league primary threads were forked
7775  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7776  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7777  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7778  " cg_nthreads to 1\n",
7779  thr, tmp));
7780  tmp->up = thr->th.th_cg_roots;
7781  thr->th.th_cg_roots = tmp;
7782 
7783 // Launch league of teams now, but not let workers execute
7784 // (they hang on fork barrier until next parallel)
7785 #if INCLUDE_SSC_MARKS
7786  SSC_MARK_FORKING();
7787 #endif
7788  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7789  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7790  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7791 #if INCLUDE_SSC_MARKS
7792  SSC_MARK_JOINING();
7793 #endif
7794  // If the team size was reduced from the limit, set it to the new size
7795  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7796  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7797  // AC: last parameter "1" eliminates join barrier which won't work because
7798  // worker threads are in a fork barrier waiting for more parallel regions
7799  __kmp_join_call(loc, gtid
7800 #if OMPT_SUPPORT
7801  ,
7802  fork_context_intel
7803 #endif
7804  ,
7805  1);
7806 }
7807 
7808 int __kmp_invoke_teams_master(int gtid) {
7809  kmp_info_t *this_thr = __kmp_threads[gtid];
7810  kmp_team_t *team = this_thr->th.th_team;
7811 #if KMP_DEBUG
7812  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7813  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7814  (void *)__kmp_teams_master);
7815 #endif
7816  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7817 #if OMPT_SUPPORT
7818  int tid = __kmp_tid_from_gtid(gtid);
7819  ompt_data_t *task_data =
7820  &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7821  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7822  if (ompt_enabled.ompt_callback_implicit_task) {
7823  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7824  ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7825  ompt_task_initial);
7826  OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7827  }
7828 #endif
7829  __kmp_teams_master(gtid);
7830 #if OMPT_SUPPORT
7831  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7832 #endif
7833  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7834  return 1;
7835 }
7836 
7837 /* this sets the requested number of threads for the next parallel region
7838  encountered by this team. since this should be enclosed in the forkjoin
7839  critical section it should avoid race conditions with asymmetrical nested
7840  parallelism */
7841 
7842 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7843  kmp_info_t *thr = __kmp_threads[gtid];
7844 
7845  if (num_threads > 0)
7846  thr->th.th_set_nproc = num_threads;
7847 }
7848 
7849 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7850  int num_threads) {
7851  KMP_DEBUG_ASSERT(thr);
7852  // Remember the number of threads for inner parallel regions
7853  if (!TCR_4(__kmp_init_middle))
7854  __kmp_middle_initialize(); // get internal globals calculated
7855  __kmp_assign_root_init_mask();
7856  KMP_DEBUG_ASSERT(__kmp_avail_proc);
7857  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7858 
7859  if (num_threads == 0) {
7860  if (__kmp_teams_thread_limit > 0) {
7861  num_threads = __kmp_teams_thread_limit;
7862  } else {
7863  num_threads = __kmp_avail_proc / num_teams;
7864  }
7865  // adjust num_threads w/o warning as it is not user setting
7866  // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7867  // no thread_limit clause specified - do not change thread-limit-var ICV
7868  if (num_threads > __kmp_dflt_team_nth) {
7869  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7870  }
7871  if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7872  num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7873  } // prevent team size to exceed thread-limit-var
7874  if (num_teams * num_threads > __kmp_teams_max_nth) {
7875  num_threads = __kmp_teams_max_nth / num_teams;
7876  }
7877  if (num_threads == 0) {
7878  num_threads = 1;
7879  }
7880  } else {
7881  if (num_threads < 0) {
7882  __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7883  __kmp_msg_null);
7884  num_threads = 1;
7885  }
7886  // This thread will be the primary thread of the league primary threads
7887  // Store new thread limit; old limit is saved in th_cg_roots list
7888  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7889  // num_threads = min(num_threads, nthreads-var)
7890  if (num_threads > __kmp_dflt_team_nth) {
7891  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7892  }
7893  if (num_teams * num_threads > __kmp_teams_max_nth) {
7894  int new_threads = __kmp_teams_max_nth / num_teams;
7895  if (new_threads == 0) {
7896  new_threads = 1;
7897  }
7898  if (new_threads != num_threads) {
7899  if (!__kmp_reserve_warn) { // user asked for too many threads
7900  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7901  __kmp_msg(kmp_ms_warning,
7902  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7903  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7904  }
7905  }
7906  num_threads = new_threads;
7907  }
7908  }
7909  thr->th.th_teams_size.nth = num_threads;
7910 }
7911 
7912 /* this sets the requested number of teams for the teams region and/or
7913  the number of threads for the next parallel region encountered */
7914 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7915  int num_threads) {
7916  kmp_info_t *thr = __kmp_threads[gtid];
7917  if (num_teams < 0) {
7918  // OpenMP specification requires requested values to be positive,
7919  // but people can send us any value, so we'd better check
7920  __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7921  __kmp_msg_null);
7922  num_teams = 1;
7923  }
7924  if (num_teams == 0) {
7925  if (__kmp_nteams > 0) {
7926  num_teams = __kmp_nteams;
7927  } else {
7928  num_teams = 1; // default number of teams is 1.
7929  }
7930  }
7931  if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7932  if (!__kmp_reserve_warn) {
7933  __kmp_reserve_warn = 1;
7934  __kmp_msg(kmp_ms_warning,
7935  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7936  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7937  }
7938  num_teams = __kmp_teams_max_nth;
7939  }
7940  // Set number of teams (number of threads in the outer "parallel" of the
7941  // teams)
7942  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7943 
7944  __kmp_push_thread_limit(thr, num_teams, num_threads);
7945 }
7946 
7947 /* This sets the requested number of teams for the teams region and/or
7948  the number of threads for the next parallel region encountered */
7949 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7950  int num_teams_ub, int num_threads) {
7951  kmp_info_t *thr = __kmp_threads[gtid];
7952  KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7953  KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7954  KMP_DEBUG_ASSERT(num_threads >= 0);
7955 
7956  if (num_teams_lb > num_teams_ub) {
7957  __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7958  KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7959  }
7960 
7961  int num_teams = 1; // defalt number of teams is 1.
7962 
7963  if (num_teams_lb == 0 && num_teams_ub > 0)
7964  num_teams_lb = num_teams_ub;
7965 
7966  if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7967  num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7968  if (num_teams > __kmp_teams_max_nth) {
7969  if (!__kmp_reserve_warn) {
7970  __kmp_reserve_warn = 1;
7971  __kmp_msg(kmp_ms_warning,
7972  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7973  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7974  }
7975  num_teams = __kmp_teams_max_nth;
7976  }
7977  } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7978  num_teams = num_teams_ub;
7979  } else { // num_teams_lb <= num_teams <= num_teams_ub
7980  if (num_threads <= 0) {
7981  if (num_teams_ub > __kmp_teams_max_nth) {
7982  num_teams = num_teams_lb;
7983  } else {
7984  num_teams = num_teams_ub;
7985  }
7986  } else {
7987  num_teams = (num_threads > __kmp_teams_max_nth)
7988  ? num_teams
7989  : __kmp_teams_max_nth / num_threads;
7990  if (num_teams < num_teams_lb) {
7991  num_teams = num_teams_lb;
7992  } else if (num_teams > num_teams_ub) {
7993  num_teams = num_teams_ub;
7994  }
7995  }
7996  }
7997  // Set number of teams (number of threads in the outer "parallel" of the
7998  // teams)
7999  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
8000 
8001  __kmp_push_thread_limit(thr, num_teams, num_threads);
8002 }
8003 
8004 // Set the proc_bind var to use in the following parallel region.
8005 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
8006  kmp_info_t *thr = __kmp_threads[gtid];
8007  thr->th.th_set_proc_bind = proc_bind;
8008 }
8009 
8010 /* Launch the worker threads into the microtask. */
8011 
8012 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
8013  kmp_info_t *this_thr = __kmp_threads[gtid];
8014 
8015 #ifdef KMP_DEBUG
8016  int f;
8017 #endif /* KMP_DEBUG */
8018 
8019  KMP_DEBUG_ASSERT(team);
8020  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8021  KMP_ASSERT(KMP_MASTER_GTID(gtid));
8022  KMP_MB(); /* Flush all pending memory write invalidates. */
8023 
8024  team->t.t_construct = 0; /* no single directives seen yet */
8025  team->t.t_ordered.dt.t_value =
8026  0; /* thread 0 enters the ordered section first */
8027 
8028  /* Reset the identifiers on the dispatch buffer */
8029  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
8030  if (team->t.t_max_nproc > 1) {
8031  int i;
8032  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
8033  team->t.t_disp_buffer[i].buffer_index = i;
8034  team->t.t_disp_buffer[i].doacross_buf_idx = i;
8035  }
8036  } else {
8037  team->t.t_disp_buffer[0].buffer_index = 0;
8038  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
8039  }
8040 
8041  KMP_MB(); /* Flush all pending memory write invalidates. */
8042  KMP_ASSERT(this_thr->th.th_team == team);
8043 
8044 #ifdef KMP_DEBUG
8045  for (f = 0; f < team->t.t_nproc; f++) {
8046  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
8047  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
8048  }
8049 #endif /* KMP_DEBUG */
8050 
8051  /* release the worker threads so they may begin working */
8052  __kmp_fork_barrier(gtid, 0);
8053 }
8054 
8055 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
8056  kmp_info_t *this_thr = __kmp_threads[gtid];
8057 
8058  KMP_DEBUG_ASSERT(team);
8059  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8060  KMP_ASSERT(KMP_MASTER_GTID(gtid));
8061  KMP_MB(); /* Flush all pending memory write invalidates. */
8062 
8063  /* Join barrier after fork */
8064 
8065 #ifdef KMP_DEBUG
8066  if (__kmp_threads[gtid] &&
8067  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8068  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8069  __kmp_threads[gtid]);
8070  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8071  "team->t.t_nproc=%d\n",
8072  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8073  team->t.t_nproc);
8074  __kmp_print_structure();
8075  }
8076  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8077  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8078 #endif /* KMP_DEBUG */
8079 
8080  __kmp_join_barrier(gtid); /* wait for everyone */
8081 #if OMPT_SUPPORT
8082  if (ompt_enabled.enabled &&
8083  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
8084  int ds_tid = this_thr->th.th_info.ds.ds_tid;
8085  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8086  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8087 #if OMPT_OPTIONAL
8088  void *codeptr = NULL;
8089  if (KMP_MASTER_TID(ds_tid) &&
8090  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8091  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8092  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8093 
8094  if (ompt_enabled.ompt_callback_sync_region_wait) {
8095  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8096  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8097  codeptr);
8098  }
8099  if (ompt_enabled.ompt_callback_sync_region) {
8100  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8101  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8102  codeptr);
8103  }
8104 #endif
8105  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8106  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8107  ompt_scope_end, NULL, task_data, 0, ds_tid,
8108  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8109  }
8110  }
8111 #endif
8112 
8113  KMP_MB(); /* Flush all pending memory write invalidates. */
8114  KMP_ASSERT(this_thr->th.th_team == team);
8115 }
8116 
8117 /* ------------------------------------------------------------------------ */
8118 
8119 #ifdef USE_LOAD_BALANCE
8120 
8121 // Return the worker threads actively spinning in the hot team, if we
8122 // are at the outermost level of parallelism. Otherwise, return 0.
8123 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8124  int i;
8125  int retval;
8126  kmp_team_t *hot_team;
8127 
8128  if (root->r.r_active) {
8129  return 0;
8130  }
8131  hot_team = root->r.r_hot_team;
8132  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8133  return hot_team->t.t_nproc - 1; // Don't count primary thread
8134  }
8135 
8136  // Skip the primary thread - it is accounted for elsewhere.
8137  retval = 0;
8138  for (i = 1; i < hot_team->t.t_nproc; i++) {
8139  if (hot_team->t.t_threads[i]->th.th_active) {
8140  retval++;
8141  }
8142  }
8143  return retval;
8144 }
8145 
8146 // Perform an automatic adjustment to the number of
8147 // threads used by the next parallel region.
8148 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8149  int retval;
8150  int pool_active;
8151  int hot_team_active;
8152  int team_curr_active;
8153  int system_active;
8154 
8155  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8156  set_nproc));
8157  KMP_DEBUG_ASSERT(root);
8158  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8159  ->th.th_current_task->td_icvs.dynamic == TRUE);
8160  KMP_DEBUG_ASSERT(set_nproc > 1);
8161 
8162  if (set_nproc == 1) {
8163  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8164  return 1;
8165  }
8166 
8167  // Threads that are active in the thread pool, active in the hot team for this
8168  // particular root (if we are at the outer par level), and the currently
8169  // executing thread (to become the primary thread) are available to add to the
8170  // new team, but are currently contributing to the system load, and must be
8171  // accounted for.
8172  pool_active = __kmp_thread_pool_active_nth;
8173  hot_team_active = __kmp_active_hot_team_nproc(root);
8174  team_curr_active = pool_active + hot_team_active + 1;
8175 
8176  // Check the system load.
8177  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8178  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8179  "hot team active = %d\n",
8180  system_active, pool_active, hot_team_active));
8181 
8182  if (system_active < 0) {
8183  // There was an error reading the necessary info from /proc, so use the
8184  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8185  // = dynamic_thread_limit, we shouldn't wind up getting back here.
8186  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8187  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8188 
8189  // Make this call behave like the thread limit algorithm.
8190  retval = __kmp_avail_proc - __kmp_nth +
8191  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8192  if (retval > set_nproc) {
8193  retval = set_nproc;
8194  }
8195  if (retval < KMP_MIN_NTH) {
8196  retval = KMP_MIN_NTH;
8197  }
8198 
8199  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8200  retval));
8201  return retval;
8202  }
8203 
8204  // There is a slight delay in the load balance algorithm in detecting new
8205  // running procs. The real system load at this instant should be at least as
8206  // large as the #active omp thread that are available to add to the team.
8207  if (system_active < team_curr_active) {
8208  system_active = team_curr_active;
8209  }
8210  retval = __kmp_avail_proc - system_active + team_curr_active;
8211  if (retval > set_nproc) {
8212  retval = set_nproc;
8213  }
8214  if (retval < KMP_MIN_NTH) {
8215  retval = KMP_MIN_NTH;
8216  }
8217 
8218  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8219  return retval;
8220 } // __kmp_load_balance_nproc()
8221 
8222 #endif /* USE_LOAD_BALANCE */
8223 
8224 /* ------------------------------------------------------------------------ */
8225 
8226 /* NOTE: this is called with the __kmp_init_lock held */
8227 void __kmp_cleanup(void) {
8228  int f;
8229 
8230  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8231 
8232  if (TCR_4(__kmp_init_parallel)) {
8233 #if KMP_HANDLE_SIGNALS
8234  __kmp_remove_signals();
8235 #endif
8236  TCW_4(__kmp_init_parallel, FALSE);
8237  }
8238 
8239  if (TCR_4(__kmp_init_middle)) {
8240 #if KMP_AFFINITY_SUPPORTED
8241  __kmp_affinity_uninitialize();
8242 #endif /* KMP_AFFINITY_SUPPORTED */
8243  __kmp_cleanup_hierarchy();
8244  TCW_4(__kmp_init_middle, FALSE);
8245  }
8246 
8247  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8248 
8249  if (__kmp_init_serial) {
8250  __kmp_runtime_destroy();
8251  __kmp_init_serial = FALSE;
8252  }
8253 
8254  __kmp_cleanup_threadprivate_caches();
8255 
8256  for (f = 0; f < __kmp_threads_capacity; f++) {
8257  if (__kmp_root[f] != NULL) {
8258  __kmp_free(__kmp_root[f]);
8259  __kmp_root[f] = NULL;
8260  }
8261  }
8262  __kmp_free(__kmp_threads);
8263  // __kmp_threads and __kmp_root were allocated at once, as single block, so
8264  // there is no need in freeing __kmp_root.
8265  __kmp_threads = NULL;
8266  __kmp_root = NULL;
8267  __kmp_threads_capacity = 0;
8268 
8269  // Free old __kmp_threads arrays if they exist.
8270  kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8271  while (ptr) {
8272  kmp_old_threads_list_t *next = ptr->next;
8273  __kmp_free(ptr->threads);
8274  __kmp_free(ptr);
8275  ptr = next;
8276  }
8277 
8278 #if KMP_USE_DYNAMIC_LOCK
8279  __kmp_cleanup_indirect_user_locks();
8280 #else
8281  __kmp_cleanup_user_locks();
8282 #endif
8283 #if OMPD_SUPPORT
8284  if (ompd_state) {
8285  __kmp_free(ompd_env_block);
8286  ompd_env_block = NULL;
8287  ompd_env_block_size = 0;
8288  }
8289 #endif
8290 
8291 #if KMP_AFFINITY_SUPPORTED
8292  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8293  __kmp_cpuinfo_file = NULL;
8294 #endif /* KMP_AFFINITY_SUPPORTED */
8295 
8296 #if KMP_USE_ADAPTIVE_LOCKS
8297 #if KMP_DEBUG_ADAPTIVE_LOCKS
8298  __kmp_print_speculative_stats();
8299 #endif
8300 #endif
8301  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8302  __kmp_nested_nth.nth = NULL;
8303  __kmp_nested_nth.size = 0;
8304  __kmp_nested_nth.used = 0;
8305  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8306  __kmp_nested_proc_bind.bind_types = NULL;
8307  __kmp_nested_proc_bind.size = 0;
8308  __kmp_nested_proc_bind.used = 0;
8309  if (__kmp_affinity_format) {
8310  KMP_INTERNAL_FREE(__kmp_affinity_format);
8311  __kmp_affinity_format = NULL;
8312  }
8313 
8314  __kmp_i18n_catclose();
8315 
8316 #if KMP_USE_HIER_SCHED
8317  __kmp_hier_scheds.deallocate();
8318 #endif
8319 
8320 #if KMP_STATS_ENABLED
8321  __kmp_stats_fini();
8322 #endif
8323 
8324  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8325 }
8326 
8327 /* ------------------------------------------------------------------------ */
8328 
8329 int __kmp_ignore_mppbeg(void) {
8330  char *env;
8331 
8332  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8333  if (__kmp_str_match_false(env))
8334  return FALSE;
8335  }
8336  // By default __kmpc_begin() is no-op.
8337  return TRUE;
8338 }
8339 
8340 int __kmp_ignore_mppend(void) {
8341  char *env;
8342 
8343  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8344  if (__kmp_str_match_false(env))
8345  return FALSE;
8346  }
8347  // By default __kmpc_end() is no-op.
8348  return TRUE;
8349 }
8350 
8351 void __kmp_internal_begin(void) {
8352  int gtid;
8353  kmp_root_t *root;
8354 
8355  /* this is a very important step as it will register new sibling threads
8356  and assign these new uber threads a new gtid */
8357  gtid = __kmp_entry_gtid();
8358  root = __kmp_threads[gtid]->th.th_root;
8359  KMP_ASSERT(KMP_UBER_GTID(gtid));
8360 
8361  if (root->r.r_begin)
8362  return;
8363  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8364  if (root->r.r_begin) {
8365  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8366  return;
8367  }
8368 
8369  root->r.r_begin = TRUE;
8370 
8371  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8372 }
8373 
8374 /* ------------------------------------------------------------------------ */
8375 
8376 void __kmp_user_set_library(enum library_type arg) {
8377  int gtid;
8378  kmp_root_t *root;
8379  kmp_info_t *thread;
8380 
8381  /* first, make sure we are initialized so we can get our gtid */
8382 
8383  gtid = __kmp_entry_gtid();
8384  thread = __kmp_threads[gtid];
8385 
8386  root = thread->th.th_root;
8387 
8388  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8389  library_serial));
8390  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8391  thread */
8392  KMP_WARNING(SetLibraryIncorrectCall);
8393  return;
8394  }
8395 
8396  switch (arg) {
8397  case library_serial:
8398  thread->th.th_set_nproc = 0;
8399  set__nproc(thread, 1);
8400  break;
8401  case library_turnaround:
8402  thread->th.th_set_nproc = 0;
8403  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8404  : __kmp_dflt_team_nth_ub);
8405  break;
8406  case library_throughput:
8407  thread->th.th_set_nproc = 0;
8408  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8409  : __kmp_dflt_team_nth_ub);
8410  break;
8411  default:
8412  KMP_FATAL(UnknownLibraryType, arg);
8413  }
8414 
8415  __kmp_aux_set_library(arg);
8416 }
8417 
8418 void __kmp_aux_set_stacksize(size_t arg) {
8419  if (!__kmp_init_serial)
8420  __kmp_serial_initialize();
8421 
8422 #if KMP_OS_DARWIN
8423  if (arg & (0x1000 - 1)) {
8424  arg &= ~(0x1000 - 1);
8425  if (arg + 0x1000) /* check for overflow if we round up */
8426  arg += 0x1000;
8427  }
8428 #endif
8429  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8430 
8431  /* only change the default stacksize before the first parallel region */
8432  if (!TCR_4(__kmp_init_parallel)) {
8433  size_t value = arg; /* argument is in bytes */
8434 
8435  if (value < __kmp_sys_min_stksize)
8436  value = __kmp_sys_min_stksize;
8437  else if (value > KMP_MAX_STKSIZE)
8438  value = KMP_MAX_STKSIZE;
8439 
8440  __kmp_stksize = value;
8441 
8442  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8443  }
8444 
8445  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8446 }
8447 
8448 /* set the behaviour of the runtime library */
8449 /* TODO this can cause some odd behaviour with sibling parallelism... */
8450 void __kmp_aux_set_library(enum library_type arg) {
8451  __kmp_library = arg;
8452 
8453  switch (__kmp_library) {
8454  case library_serial: {
8455  KMP_INFORM(LibraryIsSerial);
8456  } break;
8457  case library_turnaround:
8458  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8459  __kmp_use_yield = 2; // only yield when oversubscribed
8460  break;
8461  case library_throughput:
8462  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8463  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8464  break;
8465  default:
8466  KMP_FATAL(UnknownLibraryType, arg);
8467  }
8468 }
8469 
8470 /* Getting team information common for all team API */
8471 // Returns NULL if not in teams construct
8472 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8473  kmp_info_t *thr = __kmp_entry_thread();
8474  teams_serialized = 0;
8475  if (thr->th.th_teams_microtask) {
8476  kmp_team_t *team = thr->th.th_team;
8477  int tlevel = thr->th.th_teams_level; // the level of the teams construct
8478  int ii = team->t.t_level;
8479  teams_serialized = team->t.t_serialized;
8480  int level = tlevel + 1;
8481  KMP_DEBUG_ASSERT(ii >= tlevel);
8482  while (ii > level) {
8483  for (teams_serialized = team->t.t_serialized;
8484  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8485  }
8486  if (team->t.t_serialized && (!teams_serialized)) {
8487  team = team->t.t_parent;
8488  continue;
8489  }
8490  if (ii > level) {
8491  team = team->t.t_parent;
8492  ii--;
8493  }
8494  }
8495  return team;
8496  }
8497  return NULL;
8498 }
8499 
8500 int __kmp_aux_get_team_num() {
8501  int serialized;
8502  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8503  if (team) {
8504  if (serialized > 1) {
8505  return 0; // teams region is serialized ( 1 team of 1 thread ).
8506  } else {
8507  return team->t.t_master_tid;
8508  }
8509  }
8510  return 0;
8511 }
8512 
8513 int __kmp_aux_get_num_teams() {
8514  int serialized;
8515  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8516  if (team) {
8517  if (serialized > 1) {
8518  return 1;
8519  } else {
8520  return team->t.t_parent->t.t_nproc;
8521  }
8522  }
8523  return 1;
8524 }
8525 
8526 /* ------------------------------------------------------------------------ */
8527 
8528 /*
8529  * Affinity Format Parser
8530  *
8531  * Field is in form of: %[[[0].]size]type
8532  * % and type are required (%% means print a literal '%')
8533  * type is either single char or long name surrounded by {},
8534  * e.g., N or {num_threads}
8535  * 0 => leading zeros
8536  * . => right justified when size is specified
8537  * by default output is left justified
8538  * size is the *minimum* field length
8539  * All other characters are printed as is
8540  *
8541  * Available field types:
8542  * L {thread_level} - omp_get_level()
8543  * n {thread_num} - omp_get_thread_num()
8544  * h {host} - name of host machine
8545  * P {process_id} - process id (integer)
8546  * T {thread_identifier} - native thread identifier (integer)
8547  * N {num_threads} - omp_get_num_threads()
8548  * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8549  * a {thread_affinity} - comma separated list of integers or integer ranges
8550  * (values of affinity mask)
8551  *
8552  * Implementation-specific field types can be added
8553  * If a type is unknown, print "undefined"
8554  */
8555 
8556 // Structure holding the short name, long name, and corresponding data type
8557 // for snprintf. A table of these will represent the entire valid keyword
8558 // field types.
8559 typedef struct kmp_affinity_format_field_t {
8560  char short_name; // from spec e.g., L -> thread level
8561  const char *long_name; // from spec thread_level -> thread level
8562  char field_format; // data type for snprintf (typically 'd' or 's'
8563  // for integer or string)
8564 } kmp_affinity_format_field_t;
8565 
8566 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8567 #if KMP_AFFINITY_SUPPORTED
8568  {'A', "thread_affinity", 's'},
8569 #endif
8570  {'t', "team_num", 'd'},
8571  {'T', "num_teams", 'd'},
8572  {'L', "nesting_level", 'd'},
8573  {'n', "thread_num", 'd'},
8574  {'N', "num_threads", 'd'},
8575  {'a', "ancestor_tnum", 'd'},
8576  {'H', "host", 's'},
8577  {'P', "process_id", 'd'},
8578  {'i', "native_thread_id", 'd'}};
8579 
8580 // Return the number of characters it takes to hold field
8581 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8582  const char **ptr,
8583  kmp_str_buf_t *field_buffer) {
8584  int rc, format_index, field_value;
8585  const char *width_left, *width_right;
8586  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8587  static const int FORMAT_SIZE = 20;
8588  char format[FORMAT_SIZE] = {0};
8589  char absolute_short_name = 0;
8590 
8591  KMP_DEBUG_ASSERT(gtid >= 0);
8592  KMP_DEBUG_ASSERT(th);
8593  KMP_DEBUG_ASSERT(**ptr == '%');
8594  KMP_DEBUG_ASSERT(field_buffer);
8595 
8596  __kmp_str_buf_clear(field_buffer);
8597 
8598  // Skip the initial %
8599  (*ptr)++;
8600 
8601  // Check for %% first
8602  if (**ptr == '%') {
8603  __kmp_str_buf_cat(field_buffer, "%", 1);
8604  (*ptr)++; // skip over the second %
8605  return 1;
8606  }
8607 
8608  // Parse field modifiers if they are present
8609  pad_zeros = false;
8610  if (**ptr == '0') {
8611  pad_zeros = true;
8612  (*ptr)++; // skip over 0
8613  }
8614  right_justify = false;
8615  if (**ptr == '.') {
8616  right_justify = true;
8617  (*ptr)++; // skip over .
8618  }
8619  // Parse width of field: [width_left, width_right)
8620  width_left = width_right = NULL;
8621  if (**ptr >= '0' && **ptr <= '9') {
8622  width_left = *ptr;
8623  SKIP_DIGITS(*ptr);
8624  width_right = *ptr;
8625  }
8626 
8627  // Create the format for KMP_SNPRINTF based on flags parsed above
8628  format_index = 0;
8629  format[format_index++] = '%';
8630  if (!right_justify)
8631  format[format_index++] = '-';
8632  if (pad_zeros)
8633  format[format_index++] = '0';
8634  if (width_left && width_right) {
8635  int i = 0;
8636  // Only allow 8 digit number widths.
8637  // This also prevents overflowing format variable
8638  while (i < 8 && width_left < width_right) {
8639  format[format_index++] = *width_left;
8640  width_left++;
8641  i++;
8642  }
8643  }
8644 
8645  // Parse a name (long or short)
8646  // Canonicalize the name into absolute_short_name
8647  found_valid_name = false;
8648  parse_long_name = (**ptr == '{');
8649  if (parse_long_name)
8650  (*ptr)++; // skip initial left brace
8651  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8652  sizeof(__kmp_affinity_format_table[0]);
8653  ++i) {
8654  char short_name = __kmp_affinity_format_table[i].short_name;
8655  const char *long_name = __kmp_affinity_format_table[i].long_name;
8656  char field_format = __kmp_affinity_format_table[i].field_format;
8657  if (parse_long_name) {
8658  size_t length = KMP_STRLEN(long_name);
8659  if (strncmp(*ptr, long_name, length) == 0) {
8660  found_valid_name = true;
8661  (*ptr) += length; // skip the long name
8662  }
8663  } else if (**ptr == short_name) {
8664  found_valid_name = true;
8665  (*ptr)++; // skip the short name
8666  }
8667  if (found_valid_name) {
8668  format[format_index++] = field_format;
8669  format[format_index++] = '\0';
8670  absolute_short_name = short_name;
8671  break;
8672  }
8673  }
8674  if (parse_long_name) {
8675  if (**ptr != '}') {
8676  absolute_short_name = 0;
8677  } else {
8678  (*ptr)++; // skip over the right brace
8679  }
8680  }
8681 
8682  // Attempt to fill the buffer with the requested
8683  // value using snprintf within __kmp_str_buf_print()
8684  switch (absolute_short_name) {
8685  case 't':
8686  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8687  break;
8688  case 'T':
8689  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8690  break;
8691  case 'L':
8692  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8693  break;
8694  case 'n':
8695  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8696  break;
8697  case 'H': {
8698  static const int BUFFER_SIZE = 256;
8699  char buf[BUFFER_SIZE];
8700  __kmp_expand_host_name(buf, BUFFER_SIZE);
8701  rc = __kmp_str_buf_print(field_buffer, format, buf);
8702  } break;
8703  case 'P':
8704  rc = __kmp_str_buf_print(field_buffer, format, getpid());
8705  break;
8706  case 'i':
8707  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8708  break;
8709  case 'N':
8710  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8711  break;
8712  case 'a':
8713  field_value =
8714  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8715  rc = __kmp_str_buf_print(field_buffer, format, field_value);
8716  break;
8717 #if KMP_AFFINITY_SUPPORTED
8718  case 'A': {
8719  kmp_str_buf_t buf;
8720  __kmp_str_buf_init(&buf);
8721  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8722  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8723  __kmp_str_buf_free(&buf);
8724  } break;
8725 #endif
8726  default:
8727  // According to spec, If an implementation does not have info for field
8728  // type, then "undefined" is printed
8729  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8730  // Skip the field
8731  if (parse_long_name) {
8732  SKIP_TOKEN(*ptr);
8733  if (**ptr == '}')
8734  (*ptr)++;
8735  } else {
8736  (*ptr)++;
8737  }
8738  }
8739 
8740  KMP_ASSERT(format_index <= FORMAT_SIZE);
8741  return rc;
8742 }
8743 
8744 /*
8745  * Return number of characters needed to hold the affinity string
8746  * (not including null byte character)
8747  * The resultant string is printed to buffer, which the caller can then
8748  * handle afterwards
8749  */
8750 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8751  kmp_str_buf_t *buffer) {
8752  const char *parse_ptr;
8753  size_t retval;
8754  const kmp_info_t *th;
8755  kmp_str_buf_t field;
8756 
8757  KMP_DEBUG_ASSERT(buffer);
8758  KMP_DEBUG_ASSERT(gtid >= 0);
8759 
8760  __kmp_str_buf_init(&field);
8761  __kmp_str_buf_clear(buffer);
8762 
8763  th = __kmp_threads[gtid];
8764  retval = 0;
8765 
8766  // If format is NULL or zero-length string, then we use
8767  // affinity-format-var ICV
8768  parse_ptr = format;
8769  if (parse_ptr == NULL || *parse_ptr == '\0') {
8770  parse_ptr = __kmp_affinity_format;
8771  }
8772  KMP_DEBUG_ASSERT(parse_ptr);
8773 
8774  while (*parse_ptr != '\0') {
8775  // Parse a field
8776  if (*parse_ptr == '%') {
8777  // Put field in the buffer
8778  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8779  __kmp_str_buf_catbuf(buffer, &field);
8780  retval += rc;
8781  } else {
8782  // Put literal character in buffer
8783  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8784  retval++;
8785  parse_ptr++;
8786  }
8787  }
8788  __kmp_str_buf_free(&field);
8789  return retval;
8790 }
8791 
8792 // Displays the affinity string to stdout
8793 void __kmp_aux_display_affinity(int gtid, const char *format) {
8794  kmp_str_buf_t buf;
8795  __kmp_str_buf_init(&buf);
8796  __kmp_aux_capture_affinity(gtid, format, &buf);
8797  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8798  __kmp_str_buf_free(&buf);
8799 }
8800 
8801 /* ------------------------------------------------------------------------ */
8802 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8803  int blocktime = arg; /* argument is in microseconds */
8804 #if KMP_USE_MONITOR
8805  int bt_intervals;
8806 #endif
8807  kmp_int8 bt_set;
8808 
8809  __kmp_save_internal_controls(thread);
8810 
8811  /* Normalize and set blocktime for the teams */
8812  if (blocktime < KMP_MIN_BLOCKTIME)
8813  blocktime = KMP_MIN_BLOCKTIME;
8814  else if (blocktime > KMP_MAX_BLOCKTIME)
8815  blocktime = KMP_MAX_BLOCKTIME;
8816 
8817  set__blocktime_team(thread->th.th_team, tid, blocktime);
8818  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8819 
8820 #if KMP_USE_MONITOR
8821  /* Calculate and set blocktime intervals for the teams */
8822  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8823 
8824  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8825  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8826 #endif
8827 
8828  /* Set whether blocktime has been set to "TRUE" */
8829  bt_set = TRUE;
8830 
8831  set__bt_set_team(thread->th.th_team, tid, bt_set);
8832  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8833 #if KMP_USE_MONITOR
8834  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8835  "bt_intervals=%d, monitor_updates=%d\n",
8836  __kmp_gtid_from_tid(tid, thread->th.th_team),
8837  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8838  __kmp_monitor_wakeups));
8839 #else
8840  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8841  __kmp_gtid_from_tid(tid, thread->th.th_team),
8842  thread->th.th_team->t.t_id, tid, blocktime));
8843 #endif
8844 }
8845 
8846 void __kmp_aux_set_defaults(char const *str, size_t len) {
8847  if (!__kmp_init_serial) {
8848  __kmp_serial_initialize();
8849  }
8850  __kmp_env_initialize(str);
8851 
8852  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8853  __kmp_env_print();
8854  }
8855 } // __kmp_aux_set_defaults
8856 
8857 /* ------------------------------------------------------------------------ */
8858 /* internal fast reduction routines */
8859 
8860 PACKED_REDUCTION_METHOD_T
8861 __kmp_determine_reduction_method(
8862  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8863  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8864  kmp_critical_name *lck) {
8865 
8866  // Default reduction method: critical construct ( lck != NULL, like in current
8867  // PAROPT )
8868  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8869  // can be selected by RTL
8870  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8871  // can be selected by RTL
8872  // Finally, it's up to OpenMP RTL to make a decision on which method to select
8873  // among generated by PAROPT.
8874 
8875  PACKED_REDUCTION_METHOD_T retval;
8876 
8877  int team_size;
8878 
8879  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8880 
8881 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8882  (loc && \
8883  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8884 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8885 
8886  retval = critical_reduce_block;
8887 
8888  // another choice of getting a team size (with 1 dynamic deference) is slower
8889  team_size = __kmp_get_team_num_threads(global_tid);
8890  if (team_size == 1) {
8891 
8892  retval = empty_reduce_block;
8893 
8894  } else {
8895 
8896  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8897 
8898 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8899  KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
8900  KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_WASM
8901 
8902 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8903  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD || \
8904  KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8905 
8906  int teamsize_cutoff = 4;
8907 
8908 #if KMP_MIC_SUPPORTED
8909  if (__kmp_mic_type != non_mic) {
8910  teamsize_cutoff = 8;
8911  }
8912 #endif
8913  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8914  if (tree_available) {
8915  if (team_size <= teamsize_cutoff) {
8916  if (atomic_available) {
8917  retval = atomic_reduce_block;
8918  }
8919  } else {
8920  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8921  }
8922  } else if (atomic_available) {
8923  retval = atomic_reduce_block;
8924  }
8925 #else
8926 #error "Unknown or unsupported OS"
8927 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8928  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD ||
8929  // KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8930 
8931 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS || \
8932  KMP_ARCH_WASM || KMP_ARCH_PPC
8933 
8934 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8935  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HURD || KMP_OS_SOLARIS || \
8936  KMP_OS_WASI || KMP_OS_AIX
8937 
8938  // basic tuning
8939 
8940  if (atomic_available) {
8941  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8942  retval = atomic_reduce_block;
8943  }
8944  } // otherwise: use critical section
8945 
8946 #elif KMP_OS_DARWIN
8947 
8948  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8949  if (atomic_available && (num_vars <= 3)) {
8950  retval = atomic_reduce_block;
8951  } else if (tree_available) {
8952  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8953  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8954  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8955  }
8956  } // otherwise: use critical section
8957 
8958 #else
8959 #error "Unknown or unsupported OS"
8960 #endif
8961 
8962 #else
8963 #error "Unknown or unsupported architecture"
8964 #endif
8965  }
8966 
8967  // KMP_FORCE_REDUCTION
8968 
8969  // If the team is serialized (team_size == 1), ignore the forced reduction
8970  // method and stay with the unsynchronized method (empty_reduce_block)
8971  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8972  team_size != 1) {
8973 
8974  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8975 
8976  int atomic_available, tree_available;
8977 
8978  switch ((forced_retval = __kmp_force_reduction_method)) {
8979  case critical_reduce_block:
8980  KMP_ASSERT(lck); // lck should be != 0
8981  break;
8982 
8983  case atomic_reduce_block:
8984  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8985  if (!atomic_available) {
8986  KMP_WARNING(RedMethodNotSupported, "atomic");
8987  forced_retval = critical_reduce_block;
8988  }
8989  break;
8990 
8991  case tree_reduce_block:
8992  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8993  if (!tree_available) {
8994  KMP_WARNING(RedMethodNotSupported, "tree");
8995  forced_retval = critical_reduce_block;
8996  } else {
8997 #if KMP_FAST_REDUCTION_BARRIER
8998  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8999 #endif
9000  }
9001  break;
9002 
9003  default:
9004  KMP_ASSERT(0); // "unsupported method specified"
9005  }
9006 
9007  retval = forced_retval;
9008  }
9009 
9010  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
9011 
9012 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
9013 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
9014 
9015  return (retval);
9016 }
9017 // this function is for testing set/get/determine reduce method
9018 kmp_int32 __kmp_get_reduce_method(void) {
9019  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
9020 }
9021 
9022 // Soft pause sets up threads to ignore blocktime and just go to sleep.
9023 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
9024 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
9025 
9026 // Hard pause shuts down the runtime completely. Resume happens naturally when
9027 // OpenMP is used subsequently.
9028 void __kmp_hard_pause() {
9029  __kmp_pause_status = kmp_hard_paused;
9030  __kmp_internal_end_thread(-1);
9031 }
9032 
9033 // Soft resume sets __kmp_pause_status, and wakes up all threads.
9034 void __kmp_resume_if_soft_paused() {
9035  if (__kmp_pause_status == kmp_soft_paused) {
9036  __kmp_pause_status = kmp_not_paused;
9037 
9038  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
9039  kmp_info_t *thread = __kmp_threads[gtid];
9040  if (thread) { // Wake it if sleeping
9041  kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
9042  thread);
9043  if (fl.is_sleeping())
9044  fl.resume(gtid);
9045  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
9046  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
9047  } else { // thread holds the lock and may sleep soon
9048  do { // until either the thread sleeps, or we can get the lock
9049  if (fl.is_sleeping()) {
9050  fl.resume(gtid);
9051  break;
9052  } else if (__kmp_try_suspend_mx(thread)) {
9053  __kmp_unlock_suspend_mx(thread);
9054  break;
9055  }
9056  } while (1);
9057  }
9058  }
9059  }
9060  }
9061 }
9062 
9063 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
9064 // TODO: add warning messages
9065 int __kmp_pause_resource(kmp_pause_status_t level) {
9066  if (level == kmp_not_paused) { // requesting resume
9067  if (__kmp_pause_status == kmp_not_paused) {
9068  // error message about runtime not being paused, so can't resume
9069  return 1;
9070  } else {
9071  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9072  __kmp_pause_status == kmp_hard_paused);
9073  __kmp_pause_status = kmp_not_paused;
9074  return 0;
9075  }
9076  } else if (level == kmp_soft_paused) { // requesting soft pause
9077  if (__kmp_pause_status != kmp_not_paused) {
9078  // error message about already being paused
9079  return 1;
9080  } else {
9081  __kmp_soft_pause();
9082  return 0;
9083  }
9084  } else if (level == kmp_hard_paused) { // requesting hard pause
9085  if (__kmp_pause_status != kmp_not_paused) {
9086  // error message about already being paused
9087  return 1;
9088  } else {
9089  __kmp_hard_pause();
9090  return 0;
9091  }
9092  } else {
9093  // error message about invalid level
9094  return 1;
9095  }
9096 }
9097 
9098 void __kmp_omp_display_env(int verbose) {
9099  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9100  if (__kmp_init_serial == 0)
9101  __kmp_do_serial_initialize();
9102  __kmp_display_env_impl(!verbose, verbose);
9103  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9104 }
9105 
9106 // The team size is changing, so distributed barrier must be modified
9107 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9108  int new_nthreads) {
9109  KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9110  bp_dist_bar);
9111  kmp_info_t **other_threads = team->t.t_threads;
9112 
9113  // We want all the workers to stop waiting on the barrier while we adjust the
9114  // size of the team.
9115  for (int f = 1; f < old_nthreads; ++f) {
9116  KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9117  // Ignore threads that are already inactive or not present in the team
9118  if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9119  // teams construct causes thread_limit to get passed in, and some of
9120  // those could be inactive; just ignore them
9121  continue;
9122  }
9123  // If thread is transitioning still to in_use state, wait for it
9124  if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9125  while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9126  KMP_CPU_PAUSE();
9127  }
9128  // The thread should be in_use now
9129  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9130  // Transition to unused state
9131  team->t.t_threads[f]->th.th_used_in_team.store(2);
9132  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9133  }
9134  // Release all the workers
9135  team->t.b->go_release();
9136 
9137  KMP_MFENCE();
9138 
9139  // Workers should see transition status 2 and move to 0; but may need to be
9140  // woken up first
9141  int count = old_nthreads - 1;
9142  while (count > 0) {
9143  count = old_nthreads - 1;
9144  for (int f = 1; f < old_nthreads; ++f) {
9145  if (other_threads[f]->th.th_used_in_team.load() != 0) {
9146  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9147  kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9148  void *, other_threads[f]->th.th_sleep_loc);
9149  __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9150  }
9151  } else {
9152  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9153  count--;
9154  }
9155  }
9156  }
9157  // Now update the barrier size
9158  team->t.b->update_num_threads(new_nthreads);
9159  team->t.b->go_reset();
9160 }
9161 
9162 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9163  // Add the threads back to the team
9164  KMP_DEBUG_ASSERT(team);
9165  // Threads were paused and pointed at th_used_in_team temporarily during a
9166  // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9167  // the thread that it should transition itself back into the team. Then, if
9168  // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9169  // to wake it up.
9170  for (int f = 1; f < new_nthreads; ++f) {
9171  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9172  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9173  3);
9174  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9175  __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9176  (kmp_flag_32<false, false> *)NULL);
9177  }
9178  }
9179  // The threads should be transitioning to the team; when they are done, they
9180  // should have set th_used_in_team to 1. This loop forces master to wait until
9181  // all threads have moved into the team and are waiting in the barrier.
9182  int count = new_nthreads - 1;
9183  while (count > 0) {
9184  count = new_nthreads - 1;
9185  for (int f = 1; f < new_nthreads; ++f) {
9186  if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9187  count--;
9188  }
9189  }
9190  }
9191 }
9192 
9193 // Globals and functions for hidden helper task
9194 kmp_info_t **__kmp_hidden_helper_threads;
9195 kmp_info_t *__kmp_hidden_helper_main_thread;
9196 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9197 #if KMP_OS_LINUX
9198 kmp_int32 __kmp_hidden_helper_threads_num = 8;
9199 kmp_int32 __kmp_enable_hidden_helper = TRUE;
9200 #else
9201 kmp_int32 __kmp_hidden_helper_threads_num = 0;
9202 kmp_int32 __kmp_enable_hidden_helper = FALSE;
9203 #endif
9204 
9205 namespace {
9206 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9207 
9208 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9209  // This is an explicit synchronization on all hidden helper threads in case
9210  // that when a regular thread pushes a hidden helper task to one hidden
9211  // helper thread, the thread has not been awaken once since they're released
9212  // by the main thread after creating the team.
9213  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9214  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9215  __kmp_hidden_helper_threads_num)
9216  ;
9217 
9218  // If main thread, then wait for signal
9219  if (__kmpc_master(nullptr, *gtid)) {
9220  // First, unset the initial state and release the initial thread
9221  TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9222  __kmp_hidden_helper_initz_release();
9223  __kmp_hidden_helper_main_thread_wait();
9224  // Now wake up all worker threads
9225  for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9226  __kmp_hidden_helper_worker_thread_signal();
9227  }
9228  }
9229 }
9230 } // namespace
9231 
9232 void __kmp_hidden_helper_threads_initz_routine() {
9233  // Create a new root for hidden helper team/threads
9234  const int gtid = __kmp_register_root(TRUE);
9235  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9236  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9237  __kmp_hidden_helper_main_thread->th.th_set_nproc =
9238  __kmp_hidden_helper_threads_num;
9239 
9240  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9241 
9242  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9243 
9244  // Set the initialization flag to FALSE
9245  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9246 
9247  __kmp_hidden_helper_threads_deinitz_release();
9248 }
9249 
9250 /* Nesting Mode:
9251  Set via KMP_NESTING_MODE, which takes an integer.
9252  Note: we skip duplicate topology levels, and skip levels with only
9253  one entity.
9254  KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9255  KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9256  in the topology, and initializes the number of threads at each of those
9257  levels to the number of entities at each level, respectively, below the
9258  entity at the parent level.
9259  KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9260  but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9261  the user to turn nesting on explicitly. This is an even more experimental
9262  option to this experimental feature, and may change or go away in the
9263  future.
9264 */
9265 
9266 // Allocate space to store nesting levels
9267 void __kmp_init_nesting_mode() {
9268  int levels = KMP_HW_LAST;
9269  __kmp_nesting_mode_nlevels = levels;
9270  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9271  for (int i = 0; i < levels; ++i)
9272  __kmp_nesting_nth_level[i] = 0;
9273  if (__kmp_nested_nth.size < levels) {
9274  __kmp_nested_nth.nth =
9275  (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9276  __kmp_nested_nth.size = levels;
9277  }
9278 }
9279 
9280 // Set # threads for top levels of nesting; must be called after topology set
9281 void __kmp_set_nesting_mode_threads() {
9282  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9283 
9284  if (__kmp_nesting_mode == 1)
9285  __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9286  else if (__kmp_nesting_mode > 1)
9287  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9288 
9289  if (__kmp_topology) { // use topology info
9290  int loc, hw_level;
9291  for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9292  loc < __kmp_nesting_mode_nlevels;
9293  loc++, hw_level++) {
9294  __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9295  if (__kmp_nesting_nth_level[loc] == 1)
9296  loc--;
9297  }
9298  // Make sure all cores are used
9299  if (__kmp_nesting_mode > 1 && loc > 1) {
9300  int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9301  int num_cores = __kmp_topology->get_count(core_level);
9302  int upper_levels = 1;
9303  for (int level = 0; level < loc - 1; ++level)
9304  upper_levels *= __kmp_nesting_nth_level[level];
9305  if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9306  __kmp_nesting_nth_level[loc - 1] =
9307  num_cores / __kmp_nesting_nth_level[loc - 2];
9308  }
9309  __kmp_nesting_mode_nlevels = loc;
9310  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9311  } else { // no topology info available; provide a reasonable guesstimation
9312  if (__kmp_avail_proc >= 4) {
9313  __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9314  __kmp_nesting_nth_level[1] = 2;
9315  __kmp_nesting_mode_nlevels = 2;
9316  } else {
9317  __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9318  __kmp_nesting_mode_nlevels = 1;
9319  }
9320  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9321  }
9322  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9323  __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9324  }
9325  set__nproc(thread, __kmp_nesting_nth_level[0]);
9326  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9327  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9328  if (get__max_active_levels(thread) > 1) {
9329  // if max levels was set, set nesting mode levels to same
9330  __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9331  }
9332  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9333  set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9334 }
9335 
9336 // Empty symbols to export (see exports_so.txt) when feature is disabled
9337 extern "C" {
9338 #if !KMP_STATS_ENABLED
9339 void __kmp_reset_stats() {}
9340 #endif
9341 #if !USE_DEBUGGER
9342 int __kmp_omp_debug_struct_info = FALSE;
9343 int __kmp_debugging = FALSE;
9344 #endif
9345 #if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9346 void __kmp_itt_fini_ittlib() {}
9347 void __kmp_itt_init_ittlib() {}
9348 #endif
9349 }
9350 
9351 // end of file
@ KMP_IDENT_AUTOPAR
Definition: kmp.h:211
KMP_EXPORT void __kmpc_serialized_parallel(ident_t *, kmp_int32 global_tid)
KMP_EXPORT void __kmpc_fork_call(ident_t *, kmp_int32 nargs, kmpc_micro microtask,...)
KMP_EXPORT void __kmpc_end_serialized_parallel(ident_t *, kmp_int32 global_tid)
#define KMP_INIT_PARTITIONED_TIMERS(name)
Initializes the partitioned timers to begin with name.
Definition: kmp_stats.h:940
#define KMP_COUNT_VALUE(name, value)
Adds value to specified timer (name).
Definition: kmp_stats.h:898
stats_state_e
the states which a thread can be in
Definition: kmp_stats.h:63
sched_type
Definition: kmp.h:369
KMP_EXPORT kmp_int32 __kmpc_master(ident_t *, kmp_int32 global_tid)
@ kmp_sch_auto
Definition: kmp.h:376
@ kmp_sch_static
Definition: kmp.h:372
@ kmp_sch_guided_chunked
Definition: kmp.h:374
Definition: kmp.h:246
kmp_int32 flags
Definition: kmp.h:248