1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_i18n.h"
16 #include "kmp_io.h"
17 #include "kmp_str.h"
18 #include "kmp_wrapper_getpid.h"
19 #if KMP_USE_HIER_SCHED
20 #include "kmp_dispatch_hier.h"
21 #endif
22 #if KMP_USE_HWLOC
23 // Copied from hwloc
24 #define HWLOC_GROUP_KIND_INTEL_MODULE 102
25 #define HWLOC_GROUP_KIND_INTEL_TILE 103
26 #define HWLOC_GROUP_KIND_INTEL_DIE 104
27 #define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220
28 #endif
29 #include <ctype.h>
30 
31 // The machine topology
32 kmp_topology_t *__kmp_topology = nullptr;
33 // KMP_HW_SUBSET environment variable
34 kmp_hw_subset_t *__kmp_hw_subset = nullptr;
35 
36 // Store the real or imagined machine hierarchy here
37 static hierarchy_info machine_hierarchy;
38 
39 void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }
40 
41 #if KMP_AFFINITY_SUPPORTED
42 // Helper class to see if place lists further restrict the fullMask
43 class kmp_full_mask_modifier_t {
44  kmp_affin_mask_t *mask;
45 
46 public:
47  kmp_full_mask_modifier_t() {
48  KMP_CPU_ALLOC(mask);
49  KMP_CPU_ZERO(mask);
50  }
51  ~kmp_full_mask_modifier_t() {
52  KMP_CPU_FREE(mask);
53  mask = nullptr;
54  }
55  void include(const kmp_affin_mask_t *other) { KMP_CPU_UNION(mask, other); }
56  // If the new full mask is different from the current full mask,
57  // then switch them. Returns true if full mask was affected, false otherwise.
58  bool restrict_to_mask() {
59  // See if the new mask further restricts or changes the full mask
60  if (KMP_CPU_EQUAL(__kmp_affin_fullMask, mask) || KMP_CPU_ISEMPTY(mask))
61  return false;
62  return __kmp_topology->restrict_to_mask(mask);
63  }
64 };
65 
66 static inline const char *
67 __kmp_get_affinity_env_var(const kmp_affinity_t &affinity,
68  bool for_binding = false) {
69  if (affinity.flags.omp_places) {
70  if (for_binding)
71  return "OMP_PROC_BIND";
72  return "OMP_PLACES";
73  }
74  return affinity.env_var;
75 }
76 #endif // KMP_AFFINITY_SUPPORTED
77 
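// Initialize (or resize) the machine hierarchy used by the hierarchical barrier
// and fill in the calling thread's barrier state: depth, number of leaf kids,
// and the per-level skip array.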
78 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
79  kmp_uint32 depth;
80  // The test below is true if affinity is available but set to "none". We need
81  // to initialize on first use of the hierarchical barrier.
82  if (TCR_1(machine_hierarchy.uninitialized))
83  machine_hierarchy.init(nproc);
84 
85  // Adjust the hierarchy in case num threads exceeds original
86  if (nproc > machine_hierarchy.base_num_threads)
87  machine_hierarchy.resize(nproc);
88 
89  depth = machine_hierarchy.depth;
90  KMP_DEBUG_ASSERT(depth > 0);
91 
92  thr_bar->depth = depth;
93  __kmp_type_convert(machine_hierarchy.numPerLevel[0] - 1,
94  &(thr_bar->base_leaf_kids));
95  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
96 }
97 
98 static int nCoresPerPkg, nPackages;
99 static int __kmp_nThreadsPerCore;
100 #ifndef KMP_DFLT_NTH_CORES
101 static int __kmp_ncores;
102 #endif
103 
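// Return the human-readable message-catalog string for a topology type,
// selecting the singular or plural form (e.g., "Socket" vs. "Sockets").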
104 const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) {
105  switch (type) {
106  case KMP_HW_SOCKET:
107  return ((plural) ? KMP_I18N_STR(Sockets) : KMP_I18N_STR(Socket));
108  case KMP_HW_DIE:
109  return ((plural) ? KMP_I18N_STR(Dice) : KMP_I18N_STR(Die));
110  case KMP_HW_MODULE:
111  return ((plural) ? KMP_I18N_STR(Modules) : KMP_I18N_STR(Module));
112  case KMP_HW_TILE:
113  return ((plural) ? KMP_I18N_STR(Tiles) : KMP_I18N_STR(Tile));
114  case KMP_HW_NUMA:
115  return ((plural) ? KMP_I18N_STR(NumaDomains) : KMP_I18N_STR(NumaDomain));
116  case KMP_HW_L3:
117  return ((plural) ? KMP_I18N_STR(L3Caches) : KMP_I18N_STR(L3Cache));
118  case KMP_HW_L2:
119  return ((plural) ? KMP_I18N_STR(L2Caches) : KMP_I18N_STR(L2Cache));
120  case KMP_HW_L1:
121  return ((plural) ? KMP_I18N_STR(L1Caches) : KMP_I18N_STR(L1Cache));
122  case KMP_HW_LLC:
123  return ((plural) ? KMP_I18N_STR(LLCaches) : KMP_I18N_STR(LLCache));
124  case KMP_HW_CORE:
125  return ((plural) ? KMP_I18N_STR(Cores) : KMP_I18N_STR(Core));
126  case KMP_HW_THREAD:
127  return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread));
128  case KMP_HW_PROC_GROUP:
129  return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup));
130  case KMP_HW_UNKNOWN:
131  case KMP_HW_LAST:
132  return KMP_I18N_STR(Unknown);
133  }
134  KMP_ASSERT2(false, "Unhandled kmp_hw_t enumeration");
135  KMP_BUILTIN_UNREACHABLE;
136 }
137 
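// Return the lowercase keyword used in environment variables such as
// KMP_HW_SUBSET for a topology type (e.g., "sockets", "cores", "threads").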
138 const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) {
139  switch (type) {
140  case KMP_HW_SOCKET:
141  return ((plural) ? "sockets" : "socket");
142  case KMP_HW_DIE:
143  return ((plural) ? "dice" : "die");
144  case KMP_HW_MODULE:
145  return ((plural) ? "modules" : "module");
146  case KMP_HW_TILE:
147  return ((plural) ? "tiles" : "tile");
148  case KMP_HW_NUMA:
149  return ((plural) ? "numa_domains" : "numa_domain");
150  case KMP_HW_L3:
151  return ((plural) ? "l3_caches" : "l3_cache");
152  case KMP_HW_L2:
153  return ((plural) ? "l2_caches" : "l2_cache");
154  case KMP_HW_L1:
155  return ((plural) ? "l1_caches" : "l1_cache");
156  case KMP_HW_LLC:
157  return ((plural) ? "ll_caches" : "ll_cache");
158  case KMP_HW_CORE:
159  return ((plural) ? "cores" : "core");
160  case KMP_HW_THREAD:
161  return ((plural) ? "threads" : "thread");
162  case KMP_HW_PROC_GROUP:
163  return ((plural) ? "proc_groups" : "proc_group");
164  case KMP_HW_UNKNOWN:
165  case KMP_HW_LAST:
166  return ((plural) ? "unknowns" : "unknown");
167  }
168  KMP_ASSERT2(false, "Unhandled kmp_hw_t enumeration");
169  KMP_BUILTIN_UNREACHABLE;
170 }
171 
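// Return a printable name for a hybrid core type; "unknown" for non-hybrid or
// unrecognized core types.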
172 const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) {
173  switch (type) {
174  case KMP_HW_CORE_TYPE_UNKNOWN:
175  case KMP_HW_MAX_NUM_CORE_TYPES:
176  return "unknown";
177 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
178  case KMP_HW_CORE_TYPE_ATOM:
179  return "Intel Atom(R) processor";
180  case KMP_HW_CORE_TYPE_CORE:
181  return "Intel(R) Core(TM) processor";
182 #endif
183  }
184  KMP_ASSERT2(false, "Unhandled kmp_hw_core_type_t enumeration");
185  KMP_BUILTIN_UNREACHABLE;
186 }
187 
188 #if KMP_AFFINITY_SUPPORTED
189 // If affinity is supported, check the affinity
190 // verbose and warning flags before printing warning
191 #define KMP_AFF_WARNING(s, ...) \
192  if (s.flags.verbose || (s.flags.warnings && (s.type != affinity_none))) { \
193  KMP_WARNING(__VA_ARGS__); \
194  }
195 #else
196 #define KMP_AFF_WARNING(s, ...) KMP_WARNING(__VA_ARGS__)
197 #endif
198 
200 // kmp_hw_thread_t methods
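// qsort() comparator: order hardware threads by their ids at each topology
// level, breaking ties with the OS proc id.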
201 int kmp_hw_thread_t::compare_ids(const void *a, const void *b) {
202  const kmp_hw_thread_t *ahwthread = (const kmp_hw_thread_t *)a;
203  const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b;
204  int depth = __kmp_topology->get_depth();
205  for (int level = 0; level < depth; ++level) {
206  if (ahwthread->ids[level] < bhwthread->ids[level])
207  return -1;
208  else if (ahwthread->ids[level] > bhwthread->ids[level])
209  return 1;
210  }
211  if (ahwthread->os_id < bhwthread->os_id)
212  return -1;
213  else if (ahwthread->os_id > bhwthread->os_id)
214  return 1;
215  return 0;
216 }
217 
218 #if KMP_AFFINITY_SUPPORTED
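// qsort() comparator used when sorting hardware threads into a "compact"
// affinity order: compare the innermost `compact` sub-id levels first (deepest
// first), then the remaining outer levels.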
219 int kmp_hw_thread_t::compare_compact(const void *a, const void *b) {
220  int i;
221  const kmp_hw_thread_t *aa = (const kmp_hw_thread_t *)a;
222  const kmp_hw_thread_t *bb = (const kmp_hw_thread_t *)b;
223  int depth = __kmp_topology->get_depth();
224  int compact = __kmp_topology->compact;
225  KMP_DEBUG_ASSERT(compact >= 0);
226  KMP_DEBUG_ASSERT(compact <= depth);
227  for (i = 0; i < compact; i++) {
228  int j = depth - i - 1;
229  if (aa->sub_ids[j] < bb->sub_ids[j])
230  return -1;
231  if (aa->sub_ids[j] > bb->sub_ids[j])
232  return 1;
233  }
234  for (; i < depth; i++) {
235  int j = i - compact;
236  if (aa->sub_ids[j] < bb->sub_ids[j])
237  return -1;
238  if (aa->sub_ids[j] > bb->sub_ids[j])
239  return 1;
240  }
241  return 0;
242 }
243 #endif
244 
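// Debug print of one hardware thread: OS proc id, the id at each topology
// level, optional core attributes, and a leader marker.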
245 void kmp_hw_thread_t::print() const {
246  int depth = __kmp_topology->get_depth();
247  printf("%4d ", os_id);
248  for (int i = 0; i < depth; ++i) {
249  printf("%4d ", ids[i]);
250  }
251  if (attrs) {
252  if (attrs.is_core_type_valid())
253  printf(" (%s)", __kmp_hw_get_core_type_string(attrs.get_core_type()));
254  if (attrs.is_core_eff_valid())
255  printf(" (eff=%d)", attrs.get_core_eff());
256  }
257  if (leader)
258  printf(" (leader)");
259  printf("\n");
260 }
261 
263 // kmp_topology_t methods
264 
265 // Add a layer to the topology based on the ids. Assume the topology
266 // is perfectly nested (i.e., no object has more than one parent).
267 void kmp_topology_t::_insert_layer(kmp_hw_t type, const int *ids) {
268  // Figure out where the layer should go by comparing the ids of the current
269  // layers with the new ids
270  int target_layer;
271  int previous_id = kmp_hw_thread_t::UNKNOWN_ID;
272  int previous_new_id = kmp_hw_thread_t::UNKNOWN_ID;
273 
274  // Start from the highest layer and work down to find target layer
275  // If new layer is equal to another layer then put the new layer above
276  for (target_layer = 0; target_layer < depth; ++target_layer) {
277  bool layers_equal = true;
278  bool strictly_above_target_layer = false;
279  for (int i = 0; i < num_hw_threads; ++i) {
280  int id = hw_threads[i].ids[target_layer];
281  int new_id = ids[i];
282  if (id != previous_id && new_id == previous_new_id) {
283  // Found the layer we are strictly above
284  strictly_above_target_layer = true;
285  layers_equal = false;
286  break;
287  } else if (id == previous_id && new_id != previous_new_id) {
288  // Found a layer we are below. Move to next layer and check.
289  layers_equal = false;
290  break;
291  }
292  previous_id = id;
293  previous_new_id = new_id;
294  }
295  if (strictly_above_target_layer || layers_equal)
296  break;
297  }
298 
299  // Found the layer we are above. Now move everything to accommodate the new
300  // layer. And put the new ids and type into the topology.
301  for (int i = depth - 1, j = depth; i >= target_layer; --i, --j)
302  types[j] = types[i];
303  types[target_layer] = type;
304  for (int k = 0; k < num_hw_threads; ++k) {
305  for (int i = depth - 1, j = depth; i >= target_layer; --i, --j)
306  hw_threads[k].ids[j] = hw_threads[k].ids[i];
307  hw_threads[k].ids[target_layer] = ids[k];
308  }
309  equivalent[type] = type;
310  depth++;
311 }
312 
313 #if KMP_GROUP_AFFINITY
314 // Insert the Windows Processor Group structure into the topology
315 void kmp_topology_t::_insert_windows_proc_groups() {
316  // Do not insert the processor group structure for a single group
317  if (__kmp_num_proc_groups == 1)
318  return;
319  kmp_affin_mask_t *mask;
320  int *ids = (int *)__kmp_allocate(sizeof(int) * num_hw_threads);
321  KMP_CPU_ALLOC(mask);
322  for (int i = 0; i < num_hw_threads; ++i) {
323  KMP_CPU_ZERO(mask);
324  KMP_CPU_SET(hw_threads[i].os_id, mask);
325  ids[i] = __kmp_get_proc_group(mask);
326  }
327  KMP_CPU_FREE(mask);
328  _insert_layer(KMP_HW_PROC_GROUP, ids);
329  __kmp_free(ids);
330 }
331 #endif
332 
333 // Remove layers that don't add information to the topology.
334 // This is done by having the layer take on the id = UNKNOWN_ID (-1)
335 void kmp_topology_t::_remove_radix1_layers() {
336  int preference[KMP_HW_LAST];
337  int top_index1, top_index2;
338  // Set up preference associative array
339  preference[KMP_HW_SOCKET] = 110;
340  preference[KMP_HW_PROC_GROUP] = 100;
341  preference[KMP_HW_CORE] = 95;
342  preference[KMP_HW_THREAD] = 90;
343  preference[KMP_HW_NUMA] = 85;
344  preference[KMP_HW_DIE] = 80;
345  preference[KMP_HW_TILE] = 75;
346  preference[KMP_HW_MODULE] = 73;
347  preference[KMP_HW_L3] = 70;
348  preference[KMP_HW_L2] = 65;
349  preference[KMP_HW_L1] = 60;
350  preference[KMP_HW_LLC] = 5;
351  top_index1 = 0;
352  top_index2 = 1;
353  while (top_index1 < depth - 1 && top_index2 < depth) {
354  kmp_hw_t type1 = types[top_index1];
355  kmp_hw_t type2 = types[top_index2];
356  KMP_ASSERT_VALID_HW_TYPE(type1);
357  KMP_ASSERT_VALID_HW_TYPE(type2);
358  // Do not allow the three main topology levels (sockets, cores, threads) to
359  // be compacted down
360  if ((type1 == KMP_HW_THREAD || type1 == KMP_HW_CORE ||
361  type1 == KMP_HW_SOCKET) &&
362  (type2 == KMP_HW_THREAD || type2 == KMP_HW_CORE ||
363  type2 == KMP_HW_SOCKET)) {
364  top_index1 = top_index2++;
365  continue;
366  }
367  bool radix1 = true;
368  bool all_same = true;
369  int id1 = hw_threads[0].ids[top_index1];
370  int id2 = hw_threads[0].ids[top_index2];
371  int pref1 = preference[type1];
372  int pref2 = preference[type2];
373  for (int hwidx = 1; hwidx < num_hw_threads; ++hwidx) {
374  if (hw_threads[hwidx].ids[top_index1] == id1 &&
375  hw_threads[hwidx].ids[top_index2] != id2) {
376  radix1 = false;
377  break;
378  }
379  if (hw_threads[hwidx].ids[top_index2] != id2)
380  all_same = false;
381  id1 = hw_threads[hwidx].ids[top_index1];
382  id2 = hw_threads[hwidx].ids[top_index2];
383  }
384  if (radix1) {
385  // Select the layer to remove based on preference
386  kmp_hw_t remove_type, keep_type;
387  int remove_layer, remove_layer_ids;
388  if (pref1 > pref2) {
389  remove_type = type2;
390  remove_layer = remove_layer_ids = top_index2;
391  keep_type = type1;
392  } else {
393  remove_type = type1;
394  remove_layer = remove_layer_ids = top_index1;
395  keep_type = type2;
396  }
397  // If all the indexes for the second (deeper) layer are the same
398  // (e.g., all are zero), then make sure to keep the first layer's ids
399  if (all_same)
400  remove_layer_ids = top_index2;
401  // Remove radix one type by setting the equivalence, removing the id from
402  // the hw threads and removing the layer from types and depth
403  set_equivalent_type(remove_type, keep_type);
404  for (int idx = 0; idx < num_hw_threads; ++idx) {
405  kmp_hw_thread_t &hw_thread = hw_threads[idx];
406  for (int d = remove_layer_ids; d < depth - 1; ++d)
407  hw_thread.ids[d] = hw_thread.ids[d + 1];
408  }
409  for (int idx = remove_layer; idx < depth - 1; ++idx)
410  types[idx] = types[idx + 1];
411  depth--;
412  } else {
413  top_index1 = top_index2++;
414  }
415  }
416  KMP_ASSERT(depth > 0);
417 }
418 
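// Establish which detected layer acts as the last-level cache (LLC),
// preferring L3, then L2, then L1, with socket/core as a final fallback.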
419 void kmp_topology_t::_set_last_level_cache() {
420  if (get_equivalent_type(KMP_HW_L3) != KMP_HW_UNKNOWN)
421  set_equivalent_type(KMP_HW_LLC, KMP_HW_L3);
422  else if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
423  set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
424 #if KMP_MIC_SUPPORTED
425  else if (__kmp_mic_type == mic3) {
426  if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
427  set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
428  else if (get_equivalent_type(KMP_HW_TILE) != KMP_HW_UNKNOWN)
429  set_equivalent_type(KMP_HW_LLC, KMP_HW_TILE);
430  // L2/Tile wasn't detected so just say L1
431  else
432  set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
433  }
434 #endif
435  else if (get_equivalent_type(KMP_HW_L1) != KMP_HW_UNKNOWN)
436  set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
437  // Fallback is to set last level cache to socket or core
438  if (get_equivalent_type(KMP_HW_LLC) == KMP_HW_UNKNOWN) {
439  if (get_equivalent_type(KMP_HW_SOCKET) != KMP_HW_UNKNOWN)
440  set_equivalent_type(KMP_HW_LLC, KMP_HW_SOCKET);
441  else if (get_equivalent_type(KMP_HW_CORE) != KMP_HW_UNKNOWN)
442  set_equivalent_type(KMP_HW_LLC, KMP_HW_CORE);
443  }
444  KMP_ASSERT(get_equivalent_type(KMP_HW_LLC) != KMP_HW_UNKNOWN);
445 }
446 
447 // Gather the count of each topology layer and the ratio
448 void kmp_topology_t::_gather_enumeration_information() {
449  int previous_id[KMP_HW_LAST];
450  int max[KMP_HW_LAST];
451 
452  for (int i = 0; i < depth; ++i) {
453  previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID;
454  max[i] = 0;
455  count[i] = 0;
456  ratio[i] = 0;
457  }
458  int core_level = get_level(KMP_HW_CORE);
459  for (int i = 0; i < num_hw_threads; ++i) {
460  kmp_hw_thread_t &hw_thread = hw_threads[i];
461  for (int layer = 0; layer < depth; ++layer) {
462  int id = hw_thread.ids[layer];
463  if (id != previous_id[layer]) {
464  // Add an additional increment to each count
465  for (int l = layer; l < depth; ++l)
466  count[l]++;
467  // Keep track of topology layer ratio statistics
468  max[layer]++;
469  for (int l = layer + 1; l < depth; ++l) {
470  if (max[l] > ratio[l])
471  ratio[l] = max[l];
472  max[l] = 1;
473  }
474  // Figure out the number of different core types
475  // and efficiencies for hybrid CPUs
476  if (__kmp_is_hybrid_cpu() && core_level >= 0 && layer <= core_level) {
477  if (hw_thread.attrs.is_core_eff_valid() &&
478  hw_thread.attrs.core_eff >= num_core_efficiencies) {
479  // Because efficiencies can range from 0 to max efficiency - 1,
480  // the number of efficiencies is max efficiency + 1
481  num_core_efficiencies = hw_thread.attrs.core_eff + 1;
482  }
483  if (hw_thread.attrs.is_core_type_valid()) {
484  bool found = false;
485  for (int j = 0; j < num_core_types; ++j) {
486  if (hw_thread.attrs.get_core_type() == core_types[j]) {
487  found = true;
488  break;
489  }
490  }
491  if (!found) {
492  KMP_ASSERT(num_core_types < KMP_HW_MAX_NUM_CORE_TYPES);
493  core_types[num_core_types++] = hw_thread.attrs.get_core_type();
494  }
495  }
496  }
497  break;
498  }
499  }
500  for (int layer = 0; layer < depth; ++layer) {
501  previous_id[layer] = hw_thread.ids[layer];
502  }
503  }
504  for (int layer = 0; layer < depth; ++layer) {
505  if (max[layer] > ratio[layer])
506  ratio[layer] = max[layer];
507  }
508 }
509 
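// Count cores that have the given attribute. If find_all is true, count all
// such cores on the machine; otherwise return the maximum number of such cores
// under any single object at above_level.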
510 int kmp_topology_t::_get_ncores_with_attr(const kmp_hw_attr_t &attr,
511  int above_level,
512  bool find_all) const {
513  int current, current_max;
514  int previous_id[KMP_HW_LAST];
515  for (int i = 0; i < depth; ++i)
516  previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID;
517  int core_level = get_level(KMP_HW_CORE);
518  if (find_all)
519  above_level = -1;
520  KMP_ASSERT(above_level < core_level);
521  current_max = 0;
522  current = 0;
523  for (int i = 0; i < num_hw_threads; ++i) {
524  kmp_hw_thread_t &hw_thread = hw_threads[i];
525  if (!find_all && hw_thread.ids[above_level] != previous_id[above_level]) {
526  if (current > current_max)
527  current_max = current;
528  current = hw_thread.attrs.contains(attr);
529  } else {
530  for (int level = above_level + 1; level <= core_level; ++level) {
531  if (hw_thread.ids[level] != previous_id[level]) {
532  if (hw_thread.attrs.contains(attr))
533  current++;
534  break;
535  }
536  }
537  }
538  for (int level = 0; level < depth; ++level)
539  previous_id[level] = hw_thread.ids[level];
540  }
541  if (current > current_max)
542  current_max = current;
543  return current_max;
544 }
545 
546 // Find out if the topology is uniform
547 void kmp_topology_t::_discover_uniformity() {
548  int num = 1;
549  for (int level = 0; level < depth; ++level)
550  num *= ratio[level];
551  flags.uniform = (num == count[depth - 1]);
552 }
553 
554 // Set all the sub_ids for each hardware thread
555 void kmp_topology_t::_set_sub_ids() {
556  int previous_id[KMP_HW_LAST];
557  int sub_id[KMP_HW_LAST];
558 
559  for (int i = 0; i < depth; ++i) {
560  previous_id[i] = -1;
561  sub_id[i] = -1;
562  }
563  for (int i = 0; i < num_hw_threads; ++i) {
564  kmp_hw_thread_t &hw_thread = hw_threads[i];
565  // Setup the sub_id
566  for (int j = 0; j < depth; ++j) {
567  if (hw_thread.ids[j] != previous_id[j]) {
568  sub_id[j]++;
569  for (int k = j + 1; k < depth; ++k) {
570  sub_id[k] = 0;
571  }
572  break;
573  }
574  }
575  // Set previous_id
576  for (int j = 0; j < depth; ++j) {
577  previous_id[j] = hw_thread.ids[j];
578  }
579  // Set the sub_ids field
580  for (int j = 0; j < depth; ++j) {
581  hw_thread.sub_ids[j] = sub_id[j];
582  }
583  }
584 }
585 
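// Derive the legacy global counts (nPackages, nCoresPerPkg,
// __kmp_nThreadsPerCore, __kmp_ncores) from the canonicalized topology.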
586 void kmp_topology_t::_set_globals() {
587  // Set nCoresPerPkg, nPackages, __kmp_nThreadsPerCore, __kmp_ncores
588  int core_level, thread_level, package_level;
589  package_level = get_level(KMP_HW_SOCKET);
590 #if KMP_GROUP_AFFINITY
591  if (package_level == -1)
592  package_level = get_level(KMP_HW_PROC_GROUP);
593 #endif
594  core_level = get_level(KMP_HW_CORE);
595  thread_level = get_level(KMP_HW_THREAD);
596 
597  KMP_ASSERT(core_level != -1);
598  KMP_ASSERT(thread_level != -1);
599 
600  __kmp_nThreadsPerCore = calculate_ratio(thread_level, core_level);
601  if (package_level != -1) {
602  nCoresPerPkg = calculate_ratio(core_level, package_level);
603  nPackages = get_count(package_level);
604  } else {
605  // assume one socket
606  nCoresPerPkg = get_count(core_level);
607  nPackages = 1;
608  }
609 #ifndef KMP_DFLT_NTH_CORES
610  __kmp_ncores = get_count(core_level);
611 #endif
612 }
613 
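// Allocate a topology object, its hardware-thread array, and the
// types/ratio/count arrays in one contiguous allocation.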
614 kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth,
615  const kmp_hw_t *types) {
616  kmp_topology_t *retval;
617  // Allocate all data in one large allocation
618  size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc +
619  sizeof(int) * (size_t)KMP_HW_LAST * 3;
620  char *bytes = (char *)__kmp_allocate(size);
621  retval = (kmp_topology_t *)bytes;
622  if (nproc > 0) {
623  retval->hw_threads = (kmp_hw_thread_t *)(bytes + sizeof(kmp_topology_t));
624  } else {
625  retval->hw_threads = nullptr;
626  }
627  retval->num_hw_threads = nproc;
628  retval->depth = ndepth;
629  int *arr =
630  (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc);
631  retval->types = (kmp_hw_t *)arr;
632  retval->ratio = arr + (size_t)KMP_HW_LAST;
633  retval->count = arr + 2 * (size_t)KMP_HW_LAST;
634  retval->num_core_efficiencies = 0;
635  retval->num_core_types = 0;
636  retval->compact = 0;
637  for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i)
638  retval->core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN;
639  KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; }
640  for (int i = 0; i < ndepth; ++i) {
641  retval->types[i] = types[i];
642  retval->equivalent[types[i]] = types[i];
643  }
644  return retval;
645 }
646 
647 void kmp_topology_t::deallocate(kmp_topology_t *topology) {
648  if (topology)
649  __kmp_free(topology);
650 }
651 
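// Verify that no two (sorted) hardware threads share the same ids at every
// level; returns false if a duplicate is found.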
652 bool kmp_topology_t::check_ids() const {
653  // Assume ids have been sorted
654  if (num_hw_threads == 0)
655  return true;
656  for (int i = 1; i < num_hw_threads; ++i) {
657  kmp_hw_thread_t &current_thread = hw_threads[i];
658  kmp_hw_thread_t &previous_thread = hw_threads[i - 1];
659  bool unique = false;
660  for (int j = 0; j < depth; ++j) {
661  if (previous_thread.ids[j] != current_thread.ids[j]) {
662  unique = true;
663  break;
664  }
665  }
666  if (unique)
667  continue;
668  return false;
669  }
670  return true;
671 }
672 
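// Raw debug dump of the topology structure: types, ratios, counts, the
// equivalence map, and every hardware thread.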
673 void kmp_topology_t::dump() const {
674  printf("***********************\n");
675  printf("*** __kmp_topology: ***\n");
676  printf("***********************\n");
677  printf("* depth: %d\n", depth);
678 
679  printf("* types: ");
680  for (int i = 0; i < depth; ++i)
681  printf("%15s ", __kmp_hw_get_keyword(types[i]));
682  printf("\n");
683 
684  printf("* ratio: ");
685  for (int i = 0; i < depth; ++i) {
686  printf("%15d ", ratio[i]);
687  }
688  printf("\n");
689 
690  printf("* count: ");
691  for (int i = 0; i < depth; ++i) {
692  printf("%15d ", count[i]);
693  }
694  printf("\n");
695 
696  printf("* num_core_eff: %d\n", num_core_efficiencies);
697  printf("* num_core_types: %d\n", num_core_types);
698  printf("* core_types: ");
699  for (int i = 0; i < num_core_types; ++i)
700  printf("%3d ", core_types[i]);
701  printf("\n");
702 
703  printf("* equivalent map:\n");
704  KMP_FOREACH_HW_TYPE(i) {
705  const char *key = __kmp_hw_get_keyword(i);
706  const char *value = __kmp_hw_get_keyword(equivalent[i]);
707  printf("%-15s -> %-15s\n", key, value);
708  }
709 
710  printf("* uniform: %s\n", (is_uniform() ? "Yes" : "No"));
711 
712  printf("* num_hw_threads: %d\n", num_hw_threads);
713  printf("* hw_threads:\n");
714  for (int i = 0; i < num_hw_threads; ++i) {
715  hw_threads[i].print();
716  }
717  printf("***********************\n");
718 }
719 
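// Print the topology in the user-facing verbose format used for
// KMP_AFFINITY/OMP_PLACES. Illustrative output only (exact message numbers
// and counts depend on the machine), e.g.:
//   OMP: Info #xxx: KMP_AFFINITY: 2 sockets x 4 cores/socket x 2 threads/core (8 total cores)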
720 void kmp_topology_t::print(const char *env_var) const {
721  kmp_str_buf_t buf;
722  int print_types_depth;
723  __kmp_str_buf_init(&buf);
724  kmp_hw_t print_types[KMP_HW_LAST + 2];
725 
726  // Num Available Threads
727  if (num_hw_threads) {
728  KMP_INFORM(AvailableOSProc, env_var, num_hw_threads);
729  } else {
730  KMP_INFORM(AvailableOSProc, env_var, __kmp_xproc);
731  }
732 
733  // Uniform or not
734  if (is_uniform()) {
735  KMP_INFORM(Uniform, env_var);
736  } else {
737  KMP_INFORM(NonUniform, env_var);
738  }
739 
740  // Equivalent types
741  KMP_FOREACH_HW_TYPE(type) {
742  kmp_hw_t eq_type = equivalent[type];
743  if (eq_type != KMP_HW_UNKNOWN && eq_type != type) {
744  KMP_INFORM(AffEqualTopologyTypes, env_var,
745  __kmp_hw_get_catalog_string(type),
746  __kmp_hw_get_catalog_string(eq_type));
747  }
748  }
749 
750  // Quick topology
751  KMP_ASSERT(depth > 0 && depth <= (int)KMP_HW_LAST);
752  // Create a print types array that always guarantees printing
753  // the core and thread level
754  print_types_depth = 0;
755  for (int level = 0; level < depth; ++level)
756  print_types[print_types_depth++] = types[level];
757  if (equivalent[KMP_HW_CORE] != KMP_HW_CORE) {
758  // Force in the core level for quick topology
759  if (print_types[print_types_depth - 1] == KMP_HW_THREAD) {
760  // Force core before thread e.g., 1 socket X 2 threads/socket
761  // becomes 1 socket X 1 core/socket X 2 threads/socket
762  print_types[print_types_depth - 1] = KMP_HW_CORE;
763  print_types[print_types_depth++] = KMP_HW_THREAD;
764  } else {
765  print_types[print_types_depth++] = KMP_HW_CORE;
766  }
767  }
768  // Always put threads at very end of quick topology
769  if (equivalent[KMP_HW_THREAD] != KMP_HW_THREAD)
770  print_types[print_types_depth++] = KMP_HW_THREAD;
771 
772  __kmp_str_buf_clear(&buf);
773  kmp_hw_t numerator_type;
774  kmp_hw_t denominator_type = KMP_HW_UNKNOWN;
775  int core_level = get_level(KMP_HW_CORE);
776  int ncores = get_count(core_level);
777 
778  for (int plevel = 0, level = 0; plevel < print_types_depth; ++plevel) {
779  int c;
780  bool plural;
781  numerator_type = print_types[plevel];
782  KMP_ASSERT_VALID_HW_TYPE(numerator_type);
783  if (equivalent[numerator_type] != numerator_type)
784  c = 1;
785  else
786  c = get_ratio(level++);
787  plural = (c > 1);
788  if (plevel == 0) {
789  __kmp_str_buf_print(&buf, "%d %s", c,
790  __kmp_hw_get_catalog_string(numerator_type, plural));
791  } else {
792  __kmp_str_buf_print(&buf, " x %d %s/%s", c,
793  __kmp_hw_get_catalog_string(numerator_type, plural),
794  __kmp_hw_get_catalog_string(denominator_type));
795  }
796  denominator_type = numerator_type;
797  }
798  KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores);
799 
800  // Hybrid topology information
801  if (__kmp_is_hybrid_cpu()) {
802  for (int i = 0; i < num_core_types; ++i) {
803  kmp_hw_core_type_t core_type = core_types[i];
804  kmp_hw_attr_t attr;
805  attr.clear();
806  attr.set_core_type(core_type);
807  int ncores = get_ncores_with_attr(attr);
808  if (ncores > 0) {
809  KMP_INFORM(TopologyHybrid, env_var, ncores,
810  __kmp_hw_get_core_type_string(core_type));
811  KMP_ASSERT(num_core_efficiencies <= KMP_HW_MAX_NUM_CORE_EFFS)
812  for (int eff = 0; eff < num_core_efficiencies; ++eff) {
813  attr.set_core_eff(eff);
814  int ncores_with_eff = get_ncores_with_attr(attr);
815  if (ncores_with_eff > 0) {
816  KMP_INFORM(TopologyHybridCoreEff, env_var, ncores_with_eff, eff);
817  }
818  }
819  }
820  }
821  }
822 
823  if (num_hw_threads <= 0) {
824  __kmp_str_buf_free(&buf);
825  return;
826  }
827 
828  // Full OS proc to hardware thread map
829  KMP_INFORM(OSProcToPhysicalThreadMap, env_var);
830  for (int i = 0; i < num_hw_threads; i++) {
831  __kmp_str_buf_clear(&buf);
832  for (int level = 0; level < depth; ++level) {
833  kmp_hw_t type = types[level];
834  __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type));
835  __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]);
836  }
837  if (__kmp_is_hybrid_cpu())
838  __kmp_str_buf_print(
839  &buf, "(%s)",
840  __kmp_hw_get_core_type_string(hw_threads[i].attrs.get_core_type()));
841  KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str);
842  }
843 
844  __kmp_str_buf_free(&buf);
845 }
846 
847 #if KMP_AFFINITY_SUPPORTED
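// Validate and, if necessary, adjust the user-requested affinity granularity
// (e.g., fall back from a hybrid core attribute or an undetected layer to
// core/thread/socket), then compute affinity.gran_levels.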
848 void kmp_topology_t::set_granularity(kmp_affinity_t &affinity) const {
849  const char *env_var = __kmp_get_affinity_env_var(affinity);
850  // If hybrid CPU attributes were requested for granularity (via either
851  // OMP_PLACES or KMP_AFFINITY) but none exist, then reset the granularity
852  // and let the code below select a granularity and warn the user.
853  if (!__kmp_is_hybrid_cpu()) {
854  if (affinity.core_attr_gran.valid) {
855  // OMP_PLACES with cores:<attribute> but non-hybrid arch, use cores
856  // instead
857  KMP_AFF_WARNING(
858  affinity, AffIgnoringNonHybrid, env_var,
859  __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true));
860  affinity.gran = KMP_HW_CORE;
861  affinity.gran_levels = -1;
862  affinity.core_attr_gran = KMP_AFFINITY_ATTRS_UNKNOWN;
863  affinity.flags.core_types_gran = affinity.flags.core_effs_gran = 0;
864  } else if (affinity.flags.core_types_gran ||
865  affinity.flags.core_effs_gran) {
866  // OMP_PLACES=core_types|core_effs but non-hybrid, use cores instead
867  if (affinity.flags.omp_places) {
868  KMP_AFF_WARNING(
869  affinity, AffIgnoringNonHybrid, env_var,
870  __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true));
871  } else {
872  // KMP_AFFINITY=granularity=core_type|core_eff,...
873  KMP_AFF_WARNING(affinity, AffGranularityBad, env_var,
874  "Intel(R) Hybrid Technology core attribute",
875  __kmp_hw_get_catalog_string(KMP_HW_CORE));
876  }
877  affinity.gran = KMP_HW_CORE;
878  affinity.gran_levels = -1;
879  affinity.core_attr_gran = KMP_AFFINITY_ATTRS_UNKNOWN;
880  affinity.flags.core_types_gran = affinity.flags.core_effs_gran = 0;
881  }
882  }
883  // Set the number of affinity granularity levels
884  if (affinity.gran_levels < 0) {
885  kmp_hw_t gran_type = get_equivalent_type(affinity.gran);
886  // Check if user's granularity request is valid
887  if (gran_type == KMP_HW_UNKNOWN) {
888  // First try core, then thread, then package
889  kmp_hw_t gran_types[3] = {KMP_HW_CORE, KMP_HW_THREAD, KMP_HW_SOCKET};
890  for (auto g : gran_types) {
891  if (get_equivalent_type(g) != KMP_HW_UNKNOWN) {
892  gran_type = g;
893  break;
894  }
895  }
896  KMP_ASSERT(gran_type != KMP_HW_UNKNOWN);
897  // Warn user what granularity setting will be used instead
898  KMP_AFF_WARNING(affinity, AffGranularityBad, env_var,
899  __kmp_hw_get_catalog_string(affinity.gran),
900  __kmp_hw_get_catalog_string(gran_type));
901  affinity.gran = gran_type;
902  }
903 #if KMP_GROUP_AFFINITY
904  // If more than one processor group exists, and the level of
905  // granularity specified by the user is too coarse, then the
906  // granularity must be adjusted "down" to processor group affinity
907  // because threads can only exist within one processor group.
908  // For example, if a user sets granularity=socket and there are two
909  // processor groups that cover a socket, then the runtime must
910  // restrict the granularity down to the processor group level.
911  if (__kmp_num_proc_groups > 1) {
912  int gran_depth = get_level(gran_type);
913  int proc_group_depth = get_level(KMP_HW_PROC_GROUP);
914  if (gran_depth >= 0 && proc_group_depth >= 0 &&
915  gran_depth < proc_group_depth) {
916  KMP_AFF_WARNING(affinity, AffGranTooCoarseProcGroup, env_var,
917  __kmp_hw_get_catalog_string(affinity.gran));
918  affinity.gran = gran_type = KMP_HW_PROC_GROUP;
919  }
920  }
921 #endif
922  affinity.gran_levels = 0;
923  for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i)
924  affinity.gran_levels++;
925  }
926 }
927 #endif
928 
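// Canonicalize the detected topology: insert Windows processor groups, remove
// radix-1 layers, gather counts and ratios, discover uniformity, assign sub
// ids, set the legacy globals and the last-level cache, then sanity-check.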
929 void kmp_topology_t::canonicalize() {
930 #if KMP_GROUP_AFFINITY
931  _insert_windows_proc_groups();
932 #endif
933  _remove_radix1_layers();
934  _gather_enumeration_information();
935  _discover_uniformity();
936  _set_sub_ids();
937  _set_globals();
938  _set_last_level_cache();
939 
940 #if KMP_MIC_SUPPORTED
941  // Manually add the L2 = Tile equivalence
942  if (__kmp_mic_type == mic3) {
943  if (get_level(KMP_HW_L2) != -1)
944  set_equivalent_type(KMP_HW_TILE, KMP_HW_L2);
945  else if (get_level(KMP_HW_TILE) != -1)
946  set_equivalent_type(KMP_HW_L2, KMP_HW_TILE);
947  }
948 #endif
949 
950  // Perform post canonicalization checking
951  KMP_ASSERT(depth > 0);
952  for (int level = 0; level < depth; ++level) {
953  // All counts, ratios, and types must be valid
954  KMP_ASSERT(count[level] > 0 && ratio[level] > 0);
955  KMP_ASSERT_VALID_HW_TYPE(types[level]);
956  // Detected types must point to themselves
957  KMP_ASSERT(equivalent[types[level]] == types[level]);
958  }
959 }
960 
961 // Canonicalize an explicit packages X cores/pkg X threads/core topology
962 void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg,
963  int nthreads_per_core, int ncores) {
964  int ndepth = 3;
965  depth = ndepth;
966  KMP_FOREACH_HW_TYPE(i) { equivalent[i] = KMP_HW_UNKNOWN; }
967  for (int level = 0; level < depth; ++level) {
968  count[level] = 0;
969  ratio[level] = 0;
970  }
971  count[0] = npackages;
972  count[1] = ncores;
973  count[2] = __kmp_xproc;
974  ratio[0] = npackages;
975  ratio[1] = ncores_per_pkg;
976  ratio[2] = nthreads_per_core;
977  equivalent[KMP_HW_SOCKET] = KMP_HW_SOCKET;
978  equivalent[KMP_HW_CORE] = KMP_HW_CORE;
979  equivalent[KMP_HW_THREAD] = KMP_HW_THREAD;
980  types[0] = KMP_HW_SOCKET;
981  types[1] = KMP_HW_CORE;
982  types[2] = KMP_HW_THREAD;
983  //__kmp_avail_proc = __kmp_xproc;
984  _discover_uniformity();
985 }
986 
987 // Represents running sub IDs for a single core attribute where
988 // attribute values have SIZE possibilities.
989 template <size_t SIZE, typename IndexFunc> struct kmp_sub_ids_t {
990  int last_level; // last level in topology to consider for sub_ids
991  int sub_id[SIZE]; // The sub ID for a given attribute value
992  int prev_sub_id[KMP_HW_LAST];
993  IndexFunc indexer;
994 
995 public:
996  kmp_sub_ids_t(int last_level) : last_level(last_level) {
997  KMP_ASSERT(last_level < KMP_HW_LAST);
998  for (size_t i = 0; i < SIZE; ++i)
999  sub_id[i] = -1;
1000  for (size_t i = 0; i < KMP_HW_LAST; ++i)
1001  prev_sub_id[i] = -1;
1002  }
1003  void update(const kmp_hw_thread_t &hw_thread) {
1004  int idx = indexer(hw_thread);
1005  KMP_ASSERT(idx < (int)SIZE);
1006  for (int level = 0; level <= last_level; ++level) {
1007  if (hw_thread.sub_ids[level] != prev_sub_id[level]) {
1008  if (level < last_level)
1009  sub_id[idx] = -1;
1010  sub_id[idx]++;
1011  break;
1012  }
1013  }
1014  for (int level = 0; level <= last_level; ++level)
1015  prev_sub_id[level] = hw_thread.sub_ids[level];
1016  }
1017  int get_sub_id(const kmp_hw_thread_t &hw_thread) const {
1018  return sub_id[indexer(hw_thread)];
1019  }
1020 };
1021 
1022 #if KMP_AFFINITY_SUPPORTED
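// Build a printable description of a core attribute, e.g., "<core type> cores"
// for a core-type attribute or "cores eff=<n>" for a core-efficiency attribute.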
1023 static kmp_str_buf_t *
1024 __kmp_hw_get_catalog_core_string(const kmp_hw_attr_t &attr, kmp_str_buf_t *buf,
1025  bool plural) {
1026  __kmp_str_buf_init(buf);
1027  if (attr.is_core_type_valid())
1028  __kmp_str_buf_print(buf, "%s %s",
1029  __kmp_hw_get_core_type_string(attr.get_core_type()),
1030  __kmp_hw_get_catalog_string(KMP_HW_CORE, plural));
1031  else
1032  __kmp_str_buf_print(buf, "%s eff=%d",
1033  __kmp_hw_get_catalog_string(KMP_HW_CORE, plural),
1034  attr.get_core_eff());
1035  return buf;
1036 }
1037 
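// Drop hardware threads whose OS procs are not set in mask, update
// __kmp_affin_fullMask and __kmp_avail_proc, and re-derive the summary
// information; returns true if any hardware thread was removed.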
1038 bool kmp_topology_t::restrict_to_mask(const kmp_affin_mask_t *mask) {
1039  // Apply the filter
1040  bool affected;
1041  int new_index = 0;
1042  for (int i = 0; i < num_hw_threads; ++i) {
1043  int os_id = hw_threads[i].os_id;
1044  if (KMP_CPU_ISSET(os_id, mask)) {
1045  if (i != new_index)
1046  hw_threads[new_index] = hw_threads[i];
1047  new_index++;
1048  } else {
1049  KMP_CPU_CLR(os_id, __kmp_affin_fullMask);
1050  __kmp_avail_proc--;
1051  }
1052  }
1053 
1054  KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
1055  affected = (num_hw_threads != new_index);
1056  num_hw_threads = new_index;
1057 
1058  // Post hardware subset canonicalization
1059  if (affected) {
1060  _gather_enumeration_information();
1061  _discover_uniformity();
1062  _set_globals();
1063  _set_last_level_cache();
1064 #if KMP_OS_WINDOWS
1065  // Copy filtered full mask if topology has single processor group
1066  if (__kmp_num_proc_groups <= 1)
1067 #endif
1068  __kmp_affin_origMask->copy(__kmp_affin_fullMask);
1069  }
1070  return affected;
1071 }
1072 
1073 // Apply the KMP_HW_SUBSET environment variable to the topology.
1074 // Returns true if KMP_HW_SUBSET filtered any processors;
1075 // otherwise, returns false.
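// Illustrative example of the documented KMP_HW_SUBSET syntax:
//   KMP_HW_SUBSET=2s,4c,2t restricts the topology to
//   2 sockets x 4 cores/socket x 2 threads/core.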
1076 bool kmp_topology_t::filter_hw_subset() {
1077  // If KMP_HW_SUBSET wasn't requested, then do nothing.
1078  if (!__kmp_hw_subset)
1079  return false;
1080 
1081  // First, sort the KMP_HW_SUBSET items by the machine topology
1082  __kmp_hw_subset->sort();
1083 
1084  // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology
1085  bool using_core_types = false;
1086  bool using_core_effs = false;
1087  int hw_subset_depth = __kmp_hw_subset->get_depth();
1088  kmp_hw_t specified[KMP_HW_LAST];
1089  int *topology_levels = (int *)KMP_ALLOCA(sizeof(int) * hw_subset_depth);
1090  KMP_ASSERT(hw_subset_depth > 0);
1091  KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; }
1092  int core_level = get_level(KMP_HW_CORE);
1093  for (int i = 0; i < hw_subset_depth; ++i) {
1094  int max_count;
1095  const kmp_hw_subset_t::item_t &item = __kmp_hw_subset->at(i);
1096  int num = item.num[0];
1097  int offset = item.offset[0];
1098  kmp_hw_t type = item.type;
1099  kmp_hw_t equivalent_type = equivalent[type];
1100  int level = get_level(type);
1101  topology_levels[i] = level;
1102 
1103  // Check to see if current layer is in detected machine topology
1104  if (equivalent_type != KMP_HW_UNKNOWN) {
1105  __kmp_hw_subset->at(i).type = equivalent_type;
1106  } else {
1107  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetNotExistGeneric,
1108  __kmp_hw_get_catalog_string(type));
1109  return false;
1110  }
1111 
1112  // Check to see if current layer has already been
1113  // specified either directly or through an equivalent type
1114  if (specified[equivalent_type] != KMP_HW_UNKNOWN) {
1115  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetEqvLayers,
1116  __kmp_hw_get_catalog_string(type),
1117  __kmp_hw_get_catalog_string(specified[equivalent_type]));
1118  return false;
1119  }
1120  specified[equivalent_type] = type;
1121 
1122  // Check to see if each layer's num & offset parameters are valid
1123  max_count = get_ratio(level);
1124  if (max_count < 0 ||
1125  (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
1126  bool plural = (num > 1);
1127  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric,
1128  __kmp_hw_get_catalog_string(type, plural));
1129  return false;
1130  }
1131 
1132  // Check to see if core attributes are consistent
1133  if (core_level == level) {
1134  // Determine which core attributes are specified
1135  for (int j = 0; j < item.num_attrs; ++j) {
1136  if (item.attr[j].is_core_type_valid())
1137  using_core_types = true;
1138  if (item.attr[j].is_core_eff_valid())
1139  using_core_effs = true;
1140  }
1141 
1142  // Check if using a single core attribute on non-hybrid arch.
1143  // Do not ignore all of KMP_HW_SUBSET, just ignore the attribute.
1144  //
1145  // Check if using multiple core attributes on non-hybrid arch.
1146  // Ignore all of KMP_HW_SUBSET if this is the case.
1147  if ((using_core_effs || using_core_types) && !__kmp_is_hybrid_cpu()) {
1148  if (item.num_attrs == 1) {
1149  if (using_core_effs) {
1150  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIgnoringAttr,
1151  "efficiency");
1152  } else {
1153  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIgnoringAttr,
1154  "core_type");
1155  }
1156  using_core_effs = false;
1157  using_core_types = false;
1158  } else {
1159  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAttrsNonHybrid);
1160  return false;
1161  }
1162  }
1163 
1164  // Check if using both core types and core efficiencies together
1165  if (using_core_types && using_core_effs) {
1166  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIncompat, "core_type",
1167  "efficiency");
1168  return false;
1169  }
1170 
1171  // Check that core efficiency values are valid
1172  if (using_core_effs) {
1173  for (int j = 0; j < item.num_attrs; ++j) {
1174  if (item.attr[j].is_core_eff_valid()) {
1175  int core_eff = item.attr[j].get_core_eff();
1176  if (core_eff < 0 || core_eff >= num_core_efficiencies) {
1177  kmp_str_buf_t buf;
1178  __kmp_str_buf_init(&buf);
1179  __kmp_str_buf_print(&buf, "%d", item.attr[j].get_core_eff());
1180  __kmp_msg(kmp_ms_warning,
1181  KMP_MSG(AffHWSubsetAttrInvalid, "efficiency", buf.str),
1182  KMP_HNT(ValidValuesRange, 0, num_core_efficiencies - 1),
1183  __kmp_msg_null);
1184  __kmp_str_buf_free(&buf);
1185  return false;
1186  }
1187  }
1188  }
1189  }
1190 
1191  // Check that the number of requested cores with attributes is valid
1192  if (using_core_types || using_core_effs) {
1193  for (int j = 0; j < item.num_attrs; ++j) {
1194  int num = item.num[j];
1195  int offset = item.offset[j];
1196  int level_above = core_level - 1;
1197  if (level_above >= 0) {
1198  max_count = get_ncores_with_attr_per(item.attr[j], level_above);
1199  if (max_count <= 0 ||
1200  (num != kmp_hw_subset_t::USE_ALL && num + offset > max_count)) {
1201  kmp_str_buf_t buf;
1202  __kmp_hw_get_catalog_core_string(item.attr[j], &buf, num > 0);
1203  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetManyGeneric, buf.str);
1204  __kmp_str_buf_free(&buf);
1205  return false;
1206  }
1207  }
1208  }
1209  }
1210 
1211  if ((using_core_types || using_core_effs) && item.num_attrs > 1) {
1212  for (int j = 0; j < item.num_attrs; ++j) {
1213  // Ambiguous use of specific core attribute + generic core
1214  // e.g., 4c & 3c:intel_core or 4c & 3c:eff1
1215  if (!item.attr[j]) {
1216  kmp_hw_attr_t other_attr;
1217  for (int k = 0; k < item.num_attrs; ++k) {
1218  if (item.attr[k] != item.attr[j]) {
1219  other_attr = item.attr[k];
1220  break;
1221  }
1222  }
1223  kmp_str_buf_t buf;
1224  __kmp_hw_get_catalog_core_string(other_attr, &buf, item.num[j] > 0);
1225  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetIncompat,
1226  __kmp_hw_get_catalog_string(KMP_HW_CORE), buf.str);
1227  __kmp_str_buf_free(&buf);
1228  return false;
1229  }
1230  // Allow specifying a specific core type or core eff exactly once
1231  for (int k = 0; k < j; ++k) {
1232  if (!item.attr[j] || !item.attr[k])
1233  continue;
1234  if (item.attr[k] == item.attr[j]) {
1235  kmp_str_buf_t buf;
1236  __kmp_hw_get_catalog_core_string(item.attr[j], &buf,
1237  item.num[j] > 0);
1238  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAttrRepeat, buf.str);
1239  __kmp_str_buf_free(&buf);
1240  return false;
1241  }
1242  }
1243  }
1244  }
1245  }
1246  }
1247 
1248  struct core_type_indexer {
1249  int operator()(const kmp_hw_thread_t &t) const {
1250  switch (t.attrs.get_core_type()) {
1251  case KMP_HW_CORE_TYPE_UNKNOWN:
1252  case KMP_HW_MAX_NUM_CORE_TYPES:
1253  return 0;
1254 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1255  case KMP_HW_CORE_TYPE_ATOM:
1256  return 1;
1257  case KMP_HW_CORE_TYPE_CORE:
1258  return 2;
1259 #endif
1260  }
1261  KMP_ASSERT2(false, "Unhandled kmp_hw_thread_t enumeration");
1262  KMP_BUILTIN_UNREACHABLE;
1263  }
1264  };
1265  struct core_eff_indexer {
1266  int operator()(const kmp_hw_thread_t &t) const {
1267  return t.attrs.get_core_eff();
1268  }
1269  };
1270 
1271  kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_TYPES, core_type_indexer> core_type_sub_ids(
1272  core_level);
1273  kmp_sub_ids_t<KMP_HW_MAX_NUM_CORE_EFFS, core_eff_indexer> core_eff_sub_ids(
1274  core_level);
1275 
1276  // Determine which hardware threads should be filtered.
1277  int num_filtered = 0;
1278  kmp_affin_mask_t *filtered_mask;
1279  KMP_CPU_ALLOC(filtered_mask);
1280  KMP_CPU_COPY(filtered_mask, __kmp_affin_fullMask);
1281  for (int i = 0; i < num_hw_threads; ++i) {
1282  kmp_hw_thread_t &hw_thread = hw_threads[i];
1283  // Update type_sub_id
1284  if (using_core_types)
1285  core_type_sub_ids.update(hw_thread);
1286  if (using_core_effs)
1287  core_eff_sub_ids.update(hw_thread);
1288 
1289  // Check to see if this hardware thread should be filtered
1290  bool should_be_filtered = false;
1291  for (int hw_subset_index = 0; hw_subset_index < hw_subset_depth;
1292  ++hw_subset_index) {
1293  const auto &hw_subset_item = __kmp_hw_subset->at(hw_subset_index);
1294  int level = topology_levels[hw_subset_index];
1295  if (level == -1)
1296  continue;
1297  if ((using_core_effs || using_core_types) && level == core_level) {
1298  // Look for the core attribute in KMP_HW_SUBSET which corresponds
1299  // to this hardware thread's core attribute. Use this num,offset plus
1300  // the running sub_id for the particular core attribute of this hardware
1301  // thread to determine if the hardware thread should be filtered or not.
1302  int attr_idx;
1303  kmp_hw_core_type_t core_type = hw_thread.attrs.get_core_type();
1304  int core_eff = hw_thread.attrs.get_core_eff();
1305  for (attr_idx = 0; attr_idx < hw_subset_item.num_attrs; ++attr_idx) {
1306  if (using_core_types &&
1307  hw_subset_item.attr[attr_idx].get_core_type() == core_type)
1308  break;
1309  if (using_core_effs &&
1310  hw_subset_item.attr[attr_idx].get_core_eff() == core_eff)
1311  break;
1312  }
1313  // This core attribute isn't in the KMP_HW_SUBSET so always filter it.
1314  if (attr_idx == hw_subset_item.num_attrs) {
1315  should_be_filtered = true;
1316  break;
1317  }
1318  int sub_id;
1319  int num = hw_subset_item.num[attr_idx];
1320  int offset = hw_subset_item.offset[attr_idx];
1321  if (using_core_types)
1322  sub_id = core_type_sub_ids.get_sub_id(hw_thread);
1323  else
1324  sub_id = core_eff_sub_ids.get_sub_id(hw_thread);
1325  if (sub_id < offset ||
1326  (num != kmp_hw_subset_t::USE_ALL && sub_id >= offset + num)) {
1327  should_be_filtered = true;
1328  break;
1329  }
1330  } else {
1331  int num = hw_subset_item.num[0];
1332  int offset = hw_subset_item.offset[0];
1333  if (hw_thread.sub_ids[level] < offset ||
1334  (num != kmp_hw_subset_t::USE_ALL &&
1335  hw_thread.sub_ids[level] >= offset + num)) {
1336  should_be_filtered = true;
1337  break;
1338  }
1339  }
1340  }
1341  // Collect filtering information
1342  if (should_be_filtered) {
1343  KMP_CPU_CLR(hw_thread.os_id, filtered_mask);
1344  num_filtered++;
1345  }
1346  }
1347 
1348  // One last check that we shouldn't allow filtering entire machine
1349  if (num_filtered == num_hw_threads) {
1350  KMP_AFF_WARNING(__kmp_affinity, AffHWSubsetAllFiltered);
1351  return false;
1352  }
1353 
1354  // Apply the filter
1355  restrict_to_mask(filtered_mask);
1356  return true;
1357 }
1358 
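// Return whether two hardware threads fall within the same affinity
// granularity domain: same ids down to gran_levels, or same core type /
// efficiency when those are used as the granularity.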
1359 bool kmp_topology_t::is_close(int hwt1, int hwt2,
1360  const kmp_affinity_t &stgs) const {
1361  int hw_level = stgs.gran_levels;
1362  if (hw_level >= depth)
1363  return true;
1364  bool retval = true;
1365  const kmp_hw_thread_t &t1 = hw_threads[hwt1];
1366  const kmp_hw_thread_t &t2 = hw_threads[hwt2];
1367  if (stgs.flags.core_types_gran)
1368  return t1.attrs.get_core_type() == t2.attrs.get_core_type();
1369  if (stgs.flags.core_effs_gran)
1370  return t1.attrs.get_core_eff() == t2.attrs.get_core_eff();
1371  for (int i = 0; i < (depth - hw_level); ++i) {
1372  if (t1.ids[i] != t2.ids[i])
1373  return false;
1374  }
1375  return retval;
1376 }
1377 
1379 
1380 bool KMPAffinity::picked_api = false;
1381 
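// KMPAffinity objects and masks are allocated with the runtime's own
// __kmp_allocate/__kmp_free rather than the default global new/delete.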
1382 void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
1383 void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
1384 void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
1385 void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
1386 void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
1387 void KMPAffinity::operator delete(void *p) { __kmp_free(p); }
1388 
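// Pick the affinity API implementation once: hwloc if the user requested the
// hwloc topology method and affinity is not disabled, otherwise the native OS
// affinity layer.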
1389 void KMPAffinity::pick_api() {
1390  KMPAffinity *affinity_dispatch;
1391  if (picked_api)
1392  return;
1393 #if KMP_USE_HWLOC
1394  // Only use Hwloc if affinity isn't explicitly disabled and
1395  // user requests Hwloc topology method
1396  if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
1397  __kmp_affinity.type != affinity_disabled) {
1398  affinity_dispatch = new KMPHwlocAffinity();
1399  } else
1400 #endif
1401  {
1402  affinity_dispatch = new KMPNativeAffinity();
1403  }
1404  __kmp_affinity_dispatch = affinity_dispatch;
1405  picked_api = true;
1406 }
1407 
1408 void KMPAffinity::destroy_api() {
1409  if (__kmp_affinity_dispatch != NULL) {
1410  delete __kmp_affinity_dispatch;
1411  __kmp_affinity_dispatch = NULL;
1412  picked_api = false;
1413  }
1414 }
1415 
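// Advance scan to the terminating null of the string it currently points into.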
1416 #define KMP_ADVANCE_SCAN(scan) \
1417  while (*scan != '\0') { \
1418  scan++; \
1419  }
1420 
1421 // Print the affinity mask to the character array in a pretty format.
1422 // The format is a comma separated list of non-negative integers or integer
1423 // ranges: e.g., 1,2,3-5,7,9-15
1424 // The format can also be the string "{<empty>}" if no bits are set in mask
1425 char *__kmp_affinity_print_mask(char *buf, int buf_len,
1426  kmp_affin_mask_t *mask) {
1427  int start = 0, finish = 0, previous = 0;
1428  bool first_range;
1429  KMP_ASSERT(buf);
1430  KMP_ASSERT(buf_len >= 40);
1431  KMP_ASSERT(mask);
1432  char *scan = buf;
1433  char *end = buf + buf_len - 1;
1434 
1435  // Check for empty set.
1436  if (mask->begin() == mask->end()) {
1437  KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
1438  KMP_ADVANCE_SCAN(scan);
1439  KMP_ASSERT(scan <= end);
1440  return buf;
1441  }
1442 
1443  first_range = true;
1444  start = mask->begin();
1445  while (1) {
1446  // Find next range
1447  // [start, previous] is inclusive range of contiguous bits in mask
1448  for (finish = mask->next(start), previous = start;
1449  finish == previous + 1 && finish != mask->end();
1450  finish = mask->next(finish)) {
1451  previous = finish;
1452  }
1453 
1454  // The first range does not need a comma printed before it, but the rest
1455  // of the ranges do need a comma beforehand
1456  if (!first_range) {
1457  KMP_SNPRINTF(scan, end - scan + 1, "%s", ",");
1458  KMP_ADVANCE_SCAN(scan);
1459  } else {
1460  first_range = false;
1461  }
1462  // Range with three or more contiguous bits in the affinity mask
1463  if (previous - start > 1) {
1464  KMP_SNPRINTF(scan, end - scan + 1, "%u-%u", start, previous);
1465  } else {
1466  // Range with one or two contiguous bits in the affinity mask
1467  KMP_SNPRINTF(scan, end - scan + 1, "%u", start);
1468  KMP_ADVANCE_SCAN(scan);
1469  if (previous - start > 0) {
1470  KMP_SNPRINTF(scan, end - scan + 1, ",%u", previous);
1471  }
1472  }
1473  KMP_ADVANCE_SCAN(scan);
1474  // Start over with new start point
1475  start = finish;
1476  if (start == mask->end())
1477  break;
1478  // Check for overflow
1479  if (end - scan < 2)
1480  break;
1481  }
1482 
1483  // Check for overflow
1484  KMP_ASSERT(scan <= end);
1485  return buf;
1486 }
1487 #undef KMP_ADVANCE_SCAN
1488 
1489 // Print the affinity mask to the string buffer object in a pretty format
1490 // The format is a comma separated list of non-negative integers or integer
1491 // ranges: e.g., 1,2,3-5,7,9-15
1492 // The format can also be the string "{<empty>}" if no bits are set in mask
1493 kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
1494  kmp_affin_mask_t *mask) {
1495  int start = 0, finish = 0, previous = 0;
1496  bool first_range;
1497  KMP_ASSERT(buf);
1498  KMP_ASSERT(mask);
1499 
1500  __kmp_str_buf_clear(buf);
1501 
1502  // Check for empty set.
1503  if (mask->begin() == mask->end()) {
1504  __kmp_str_buf_print(buf, "%s", "{<empty>}");
1505  return buf;
1506  }
1507 
1508  first_range = true;
1509  start = mask->begin();
1510  while (1) {
1511  // Find next range
1512  // [start, previous] is inclusive range of contiguous bits in mask
1513  for (finish = mask->next(start), previous = start;
1514  finish == previous + 1 && finish != mask->end();
1515  finish = mask->next(finish)) {
1516  previous = finish;
1517  }
1518 
1519  // The first range does not need a comma printed before it, but the rest
1520  // of the ranges do need a comma beforehand
1521  if (!first_range) {
1522  __kmp_str_buf_print(buf, "%s", ",");
1523  } else {
1524  first_range = false;
1525  }
1526  // Range with three or more contiguous bits in the affinity mask
1527  if (previous - start > 1) {
1528  __kmp_str_buf_print(buf, "%u-%u", start, previous);
1529  } else {
1530  // Range with one or two contiguous bits in the affinity mask
1531  __kmp_str_buf_print(buf, "%u", start);
1532  if (previous - start > 0) {
1533  __kmp_str_buf_print(buf, ",%u", previous);
1534  }
1535  }
1536  // Start over with new start point
1537  start = finish;
1538  if (start == mask->end())
1539  break;
1540  }
1541  return buf;
1542 }
1543 
1544 // Return (possibly empty) affinity mask representing the offline CPUs
1545 // Caller must free the mask
1546 kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() {
1547  kmp_affin_mask_t *offline;
1548  KMP_CPU_ALLOC(offline);
1549  KMP_CPU_ZERO(offline);
1550 #if KMP_OS_LINUX
1551  int n, begin_cpu, end_cpu;
1552  kmp_safe_raii_file_t offline_file;
1553  auto skip_ws = [](FILE *f) {
1554  int c;
1555  do {
1556  c = fgetc(f);
1557  } while (isspace(c));
1558  if (c != EOF)
1559  ungetc(c, f);
1560  };
1561  // File contains CSV of integer ranges representing the offline CPUs
1562  // e.g., 1,2,4-7,9,11-15
1563  int status = offline_file.try_open("/sys/devices/system/cpu/offline", "r");
1564  if (status != 0)
1565  return offline;
1566  while (!feof(offline_file)) {
1567  skip_ws(offline_file);
1568  n = fscanf(offline_file, "%d", &begin_cpu);
1569  if (n != 1)
1570  break;
1571  skip_ws(offline_file);
1572  int c = fgetc(offline_file);
1573  if (c == EOF || c == ',') {
1574  // Just single CPU
1575  end_cpu = begin_cpu;
1576  } else if (c == '-') {
1577  // Range of CPUs
1578  skip_ws(offline_file);
1579  n = fscanf(offline_file, "%d", &end_cpu);
1580  if (n != 1)
1581  break;
1582  skip_ws(offline_file);
1583  c = fgetc(offline_file); // skip ','
1584  } else {
1585  // Syntax problem
1586  break;
1587  }
1588  // Ensure a valid range of CPUs
1589  if (begin_cpu < 0 || begin_cpu >= __kmp_xproc || end_cpu < 0 ||
1590  end_cpu >= __kmp_xproc || begin_cpu > end_cpu) {
1591  continue;
1592  }
1593  // Insert [begin_cpu, end_cpu] into offline mask
1594  for (int cpu = begin_cpu; cpu <= end_cpu; ++cpu) {
1595  KMP_CPU_SET(cpu, offline);
1596  }
1597  }
1598 #endif
1599  return offline;
1600 }
1601 
1602 // Fill mask with all available procs on the machine and return their count
1603 int __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
1604  int avail_proc = 0;
1605  KMP_CPU_ZERO(mask);
1606 
1607 #if KMP_GROUP_AFFINITY
1608 
1609  if (__kmp_num_proc_groups > 1) {
1610  int group;
1611  KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
1612  for (group = 0; group < __kmp_num_proc_groups; group++) {
1613  int i;
1614  int num = __kmp_GetActiveProcessorCount(group);
1615  for (i = 0; i < num; i++) {
1616  KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
1617  avail_proc++;
1618  }
1619  }
1620  } else
1621 
1622 #endif /* KMP_GROUP_AFFINITY */
1623 
1624  {
1625  int proc;
1626  kmp_affin_mask_t *offline_cpus = __kmp_affinity_get_offline_cpus();
1627  for (proc = 0; proc < __kmp_xproc; proc++) {
1628  // Skip offline CPUs
1629  if (KMP_CPU_ISSET(proc, offline_cpus))
1630  continue;
1631  KMP_CPU_SET(proc, mask);
1632  avail_proc++;
1633  }
1634  KMP_CPU_FREE(offline_cpus);
1635  }
1636 
1637  return avail_proc;
1638 }
1639 
1640 // All of the __kmp_affinity_create_*_map() routines should allocate the
1641 // internal topology object and set the layer ids for it. Each routine
1642 // returns a boolean on whether it was successful at doing so.
1643 kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
1644 // Original mask is a subset of full mask in multiple processor groups topology
1645 kmp_affin_mask_t *__kmp_affin_origMask = NULL;
1646 
1647 #if KMP_USE_HWLOC
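// Returns whether the hwloc object is a cache object (handles the hwloc 1.x
// vs. 2.x API difference).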
1648 static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) {
1649 #if HWLOC_API_VERSION >= 0x00020000
1650  return hwloc_obj_type_is_cache(obj->type);
1651 #else
1652  return obj->type == HWLOC_OBJ_CACHE;
1653 #endif
1654 }
1655 
1656 // Returns KMP_HW_* type derived from HWLOC_* type
1657 static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) {
1658 
1659  if (__kmp_hwloc_is_cache_type(obj)) {
1660  if (obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION)
1661  return KMP_HW_UNKNOWN;
1662  switch (obj->attr->cache.depth) {
1663  case 1:
1664  return KMP_HW_L1;
1665  case 2:
1666 #if KMP_MIC_SUPPORTED
1667  if (__kmp_mic_type == mic3) {
1668  return KMP_HW_TILE;
1669  }
1670 #endif
1671  return KMP_HW_L2;
1672  case 3:
1673  return KMP_HW_L3;
1674  }
1675  return KMP_HW_UNKNOWN;
1676  }
1677 
1678  switch (obj->type) {
1679  case HWLOC_OBJ_PACKAGE:
1680  return KMP_HW_SOCKET;
1681  case HWLOC_OBJ_NUMANODE:
1682  return KMP_HW_NUMA;
1683  case HWLOC_OBJ_CORE:
1684  return KMP_HW_CORE;
1685  case HWLOC_OBJ_PU:
1686  return KMP_HW_THREAD;
1687  case HWLOC_OBJ_GROUP:
1688 #if HWLOC_API_VERSION >= 0x00020000
1689  if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE)
1690  return KMP_HW_DIE;
1691  else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE)
1692  return KMP_HW_TILE;
1693  else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_MODULE)
1694  return KMP_HW_MODULE;
1695  else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP)
1696  return KMP_HW_PROC_GROUP;
1697 #endif
1698  return KMP_HW_UNKNOWN;
1699 #if HWLOC_API_VERSION >= 0x00020100
1700  case HWLOC_OBJ_DIE:
1701  return KMP_HW_DIE;
1702 #endif
1703  }
1704  return KMP_HW_UNKNOWN;
1705 }
1706 
1707 // Returns the number of objects of type 'type' below 'obj' within the topology
1708 // tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
1709 // HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET
1710 // object.
1711 static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
1712  hwloc_obj_type_t type) {
1713  int retval = 0;
1714  hwloc_obj_t first;
1715  for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
1716  obj->logical_index, type, 0);
1717  first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology,
1718  obj->type, first) == obj;
1719  first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
1720  first)) {
1721  ++retval;
1722  }
1723  return retval;
1724 }
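
// Illustrative sketch (not part of the runtime, assuming hwloc 2.x): the same
// kind of count can also be obtained with hwloc's cpuset helpers instead of
// the sibling walk above. The function name below is hypothetical.
#if 0
#include <hwloc.h>
#include <cstdio>

static void print_pus_in_first_package(void) {
  hwloc_topology_t topo;
  hwloc_topology_init(&topo);
  hwloc_topology_load(topo);
  hwloc_obj_t pkg = hwloc_get_obj_by_type(topo, HWLOC_OBJ_PACKAGE, 0);
  if (pkg) {
    // Count PU objects whose cpuset is contained in the package's cpuset.
    int npus =
        hwloc_get_nbobjs_inside_cpuset_by_type(topo, pkg->cpuset, HWLOC_OBJ_PU);
    std::printf("PUs under package 0: %d\n", npus);
  }
  hwloc_topology_destroy(topo);
}
#endif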
1725 
1726 // This gets the sub_id for a lower object under a higher object in the
1727 // topology tree
1728 static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher,
1729  hwloc_obj_t lower) {
1730  hwloc_obj_t obj;
1731  hwloc_obj_type_t ltype = lower->type;
1732  int lindex = lower->logical_index - 1;
1733  int sub_id = 0;
1734  // Get the previous lower object
1735  obj = hwloc_get_obj_by_type(t, ltype, lindex);
1736  while (obj && lindex >= 0 &&
1737  hwloc_bitmap_isincluded(obj->cpuset, higher->cpuset)) {
1738  if (obj->userdata) {
1739  sub_id = (int)(RCAST(kmp_intptr_t, obj->userdata));
1740  break;
1741  }
1742  sub_id++;
1743  lindex--;
1744  obj = hwloc_get_obj_by_type(t, ltype, lindex);
1745  }
1746  // store sub_id + 1 so that 0 is distinguished from NULL
1747  lower->userdata = RCAST(void *, sub_id + 1);
1748  return sub_id;
1749 }
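
// Illustrative sketch (not part of the runtime): the "+ 1" stored in userdata
// above is a common trick for caching a zero-based index in a pointer-sized
// slot whose NULL value means "not computed yet". A generic, hypothetical
// form of the same pattern:
#if 0
#include <cstdint>

static int get_cached_index(void **slot, int (*compute)(void)) {
  if (*slot)                           // already cached
    return (int)(intptr_t)(*slot) - 1; // decode: stored value - 1
  int idx = compute();
  *slot = (void *)(intptr_t)(idx + 1); // encode: index 0 becomes 1 != NULL
  return idx;
}
#endif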
1750 
1751 static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
1752  kmp_hw_t type;
1753  int hw_thread_index, sub_id;
1754  int depth;
1755  hwloc_obj_t pu, obj, root, prev;
1756  kmp_hw_t types[KMP_HW_LAST];
1757  hwloc_obj_type_t hwloc_types[KMP_HW_LAST];
1758 
1759  hwloc_topology_t tp = __kmp_hwloc_topology;
1760  *msg_id = kmp_i18n_null;
1761  if (__kmp_affinity.flags.verbose) {
1762  KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
1763  }
1764 
1765  if (!KMP_AFFINITY_CAPABLE()) {
1766  // Hack to try and infer the machine topology using only the data
1767  // available from hwloc on the current thread, and __kmp_xproc.
1768  KMP_ASSERT(__kmp_affinity.type == affinity_none);
1769  // hwloc only guarantees existence of PU object, so check PACKAGE and CORE
1770  hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
1771  if (o != NULL)
1772  nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE);
1773  else
1774  nCoresPerPkg = 1; // no PACKAGE found
1775  o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0);
1776  if (o != NULL)
1777  __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
1778  else
1779  __kmp_nThreadsPerCore = 1; // no CORE found
1780  __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1781  if (nCoresPerPkg == 0)
1782  nCoresPerPkg = 1; // to prevent possible division by 0
1783  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1784  return true;
1785  }
1786 
1787 #if HWLOC_API_VERSION >= 0x00020400
1788  // Handle multiple types of cores if they exist on the system
1789  int nr_cpu_kinds = hwloc_cpukinds_get_nr(tp, 0);
1790 
1791  typedef struct kmp_hwloc_cpukinds_info_t {
1792  int efficiency;
1793  kmp_hw_core_type_t core_type;
1794  hwloc_bitmap_t mask;
1795  } kmp_hwloc_cpukinds_info_t;
1796  kmp_hwloc_cpukinds_info_t *cpukinds = nullptr;
1797 
1798  if (nr_cpu_kinds > 0) {
1799  unsigned nr_infos;
1800  struct hwloc_info_s *infos;
1801  cpukinds = (kmp_hwloc_cpukinds_info_t *)__kmp_allocate(
1802  sizeof(kmp_hwloc_cpukinds_info_t) * nr_cpu_kinds);
1803  for (unsigned idx = 0; idx < (unsigned)nr_cpu_kinds; ++idx) {
1804  cpukinds[idx].efficiency = -1;
1805  cpukinds[idx].core_type = KMP_HW_CORE_TYPE_UNKNOWN;
1806  cpukinds[idx].mask = hwloc_bitmap_alloc();
1807  if (hwloc_cpukinds_get_info(tp, idx, cpukinds[idx].mask,
1808  &cpukinds[idx].efficiency, &nr_infos, &infos,
1809  0) == 0) {
1810  for (unsigned i = 0; i < nr_infos; ++i) {
1811  if (__kmp_str_match("CoreType", 8, infos[i].name)) {
1812 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1813  if (__kmp_str_match("IntelAtom", 9, infos[i].value)) {
1814  cpukinds[idx].core_type = KMP_HW_CORE_TYPE_ATOM;
1815  break;
1816  } else if (__kmp_str_match("IntelCore", 9, infos[i].value)) {
1817  cpukinds[idx].core_type = KMP_HW_CORE_TYPE_CORE;
1818  break;
1819  }
1820 #endif
1821  }
1822  }
1823  }
1824  }
1825  }
1826 #endif
1827 
1828  root = hwloc_get_root_obj(tp);
1829 
1830  // Figure out the depth and types in the topology
1831  depth = 0;
1832  pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
1833  KMP_ASSERT(pu);
1834  obj = pu;
1835  types[depth] = KMP_HW_THREAD;
1836  hwloc_types[depth] = obj->type;
1837  depth++;
1838  while (obj != root && obj != NULL) {
1839  obj = obj->parent;
1840 #if HWLOC_API_VERSION >= 0x00020000
1841  if (obj->memory_arity) {
1842  hwloc_obj_t memory;
1843  for (memory = obj->memory_first_child; memory;
1844  memory = hwloc_get_next_child(tp, obj, memory)) {
1845  if (memory->type == HWLOC_OBJ_NUMANODE)
1846  break;
1847  }
1848  if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
1849  types[depth] = KMP_HW_NUMA;
1850  hwloc_types[depth] = memory->type;
1851  depth++;
1852  }
1853  }
1854 #endif
1855  type = __kmp_hwloc_type_2_topology_type(obj);
1856  if (type != KMP_HW_UNKNOWN) {
1857  types[depth] = type;
1858  hwloc_types[depth] = obj->type;
1859  depth++;
1860  }
1861  }
1862  KMP_ASSERT(depth > 0);
1863 
1864  // Get the order for the types correct
1865  for (int i = 0, j = depth - 1; i < j; ++i, --j) {
1866  hwloc_obj_type_t hwloc_temp = hwloc_types[i];
1867  kmp_hw_t temp = types[i];
1868  types[i] = types[j];
1869  types[j] = temp;
1870  hwloc_types[i] = hwloc_types[j];
1871  hwloc_types[j] = hwloc_temp;
1872  }
1873 
1874  // Allocate the data structure to be returned.
1875  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
1876 
1877  hw_thread_index = 0;
1878  pu = NULL;
1879  while ((pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu))) {
1880  int index = depth - 1;
1881  bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask);
1882  kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
1883  if (included) {
1884  hw_thread.clear();
1885  hw_thread.ids[index] = pu->logical_index;
1886  hw_thread.os_id = pu->os_index;
1887  // If multiple core types, then set that attribute for the hardware thread
1888 #if HWLOC_API_VERSION >= 0x00020400
1889  if (cpukinds) {
1890  int cpukind_index = -1;
1891  for (int i = 0; i < nr_cpu_kinds; ++i) {
1892  if (hwloc_bitmap_isset(cpukinds[i].mask, hw_thread.os_id)) {
1893  cpukind_index = i;
1894  break;
1895  }
1896  }
1897  if (cpukind_index >= 0) {
1898  hw_thread.attrs.set_core_type(cpukinds[cpukind_index].core_type);
1899  hw_thread.attrs.set_core_eff(cpukinds[cpukind_index].efficiency);
1900  }
1901  }
1902 #endif
1903  index--;
1904  }
1905  obj = pu;
1906  prev = obj;
1907  while (obj != root && obj != NULL) {
1908  obj = obj->parent;
1909 #if HWLOC_API_VERSION >= 0x00020000
1910  // NUMA Nodes are handled differently since they are not within the
1911  // parent/child structure anymore. They are separate children
1912  // of obj (memory_first_child points to first memory child)
1913  if (obj->memory_arity) {
1914  hwloc_obj_t memory;
1915  for (memory = obj->memory_first_child; memory;
1916  memory = hwloc_get_next_child(tp, obj, memory)) {
1917  if (memory->type == HWLOC_OBJ_NUMANODE)
1918  break;
1919  }
1920  if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
1921  sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev);
1922  if (included) {
1923  hw_thread.ids[index] = memory->logical_index;
1924  hw_thread.ids[index + 1] = sub_id;
1925  index--;
1926  }
1927  prev = memory;
1928  }
1929  prev = obj;
1930  }
1931 #endif
1932  type = __kmp_hwloc_type_2_topology_type(obj);
1933  if (type != KMP_HW_UNKNOWN) {
1934  sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev);
1935  if (included) {
1936  hw_thread.ids[index] = obj->logical_index;
1937  hw_thread.ids[index + 1] = sub_id;
1938  index--;
1939  }
1940  prev = obj;
1941  }
1942  }
1943  if (included)
1944  hw_thread_index++;
1945  }
1946 
1947 #if HWLOC_API_VERSION >= 0x00020400
1948  // Free the core types information
1949  if (cpukinds) {
1950  for (int idx = 0; idx < nr_cpu_kinds; ++idx)
1951  hwloc_bitmap_free(cpukinds[idx].mask);
1952  __kmp_free(cpukinds);
1953  }
1954 #endif
1955  __kmp_topology->sort_ids();
1956  return true;
1957 }
1958 #endif // KMP_USE_HWLOC
1959 
1960 // If we don't know how to retrieve the machine's processor topology, or
1961 // encounter an error in doing so, this routine is called to form a "flat"
1962 // mapping of os thread id's <-> processor id's.
1963 static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) {
1964  *msg_id = kmp_i18n_null;
1965  int depth = 3;
1966  kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
1967 
1968  if (__kmp_affinity.flags.verbose) {
1969  KMP_INFORM(UsingFlatOS, "KMP_AFFINITY");
1970  }
1971 
1972  // Even if __kmp_affinity.type == affinity_none, this routine might still
1973  // be called to set __kmp_ncores, as well as
1974  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1975  if (!KMP_AFFINITY_CAPABLE()) {
1976  KMP_ASSERT(__kmp_affinity.type == affinity_none);
1977  __kmp_ncores = nPackages = __kmp_xproc;
1978  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1979  return true;
1980  }
1981 
1982  // When affinity is off, this routine will still be called to set
1983  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1984  // Make sure all these vars are set correctly, and return now if affinity is
1985  // not enabled.
1986  __kmp_ncores = nPackages = __kmp_avail_proc;
1987  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1988 
1989  // Construct the data structure to be returned.
1990  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
1991  int avail_ct = 0;
1992  int i;
1993  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
1994  // Skip this proc if it is not included in the machine model.
1995  if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
1996  continue;
1997  }
1998  kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
1999  hw_thread.clear();
2000  hw_thread.os_id = i;
2001  hw_thread.ids[0] = i;
2002  hw_thread.ids[1] = 0;
2003  hw_thread.ids[2] = 0;
2004  avail_ct++;
2005  }
2006  if (__kmp_affinity.flags.verbose) {
2007  KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
2008  }
2009  return true;
2010 }
2011 
2012 #if KMP_GROUP_AFFINITY
2013 // If multiple Windows* OS processor groups exist, we can create a 2-level
2014 // topology map with the groups at level 0 and the individual procs at level 1.
2015 // This facilitates letting the threads float among all procs in a group,
2016 // if granularity=group (the default when there are multiple groups).
2017 static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) {
2018  *msg_id = kmp_i18n_null;
2019  int depth = 3;
2020  kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD};
2021  const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR);
2022 
2023  if (__kmp_affinity.flags.verbose) {
2024  KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
2025  }
2026 
2027  // If we aren't affinity capable, then use flat topology
2028  if (!KMP_AFFINITY_CAPABLE()) {
2029  KMP_ASSERT(__kmp_affinity.type == affinity_none);
2030  nPackages = __kmp_num_proc_groups;
2031  __kmp_nThreadsPerCore = 1;
2032  __kmp_ncores = __kmp_xproc;
2033  nCoresPerPkg = __kmp_ncores / nPackages;
2034  return true;
2035  }
2036 
2037  // Construct the data structure to be returned.
2038  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
2039  int avail_ct = 0;
2040  int i;
2041  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
2042  // Skip this proc if it is not included in the machine model.
2043  if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
2044  continue;
2045  }
2046  kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++);
2047  hw_thread.clear();
2048  hw_thread.os_id = i;
2049  hw_thread.ids[0] = i / BITS_PER_GROUP;
2050  hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP;
2051  }
2052  return true;
2053 }
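
// Worked example (an illustration, not part of the runtime), assuming a
// 64-bit DWORD_PTR so BITS_PER_GROUP == 64: OS proc 70 falls in processor
// group 70 / 64 == 1 at bit 70 % 64 == 6, so its topology ids become
// {ids[0], ids[1], ids[2]} = {1, 6, 6}.
#if 0
static void proc_group_id_example(void) {
  const int bits_per_group = 64; // CHAR_BIT * sizeof(DWORD_PTR) on Win64
  int os_proc = 70;
  int group = os_proc / bits_per_group;    // 1
  int in_group = os_proc % bits_per_group; // 6
  (void)group;
  (void)in_group;
}
#endif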
2054 #endif /* KMP_GROUP_AFFINITY */
2055 
2056 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
2057 
2058 template <kmp_uint32 LSB, kmp_uint32 MSB>
2059 static inline unsigned __kmp_extract_bits(kmp_uint32 v) {
2060  const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB;
2061  const kmp_uint32 SHIFT_RIGHT = LSB;
2062  kmp_uint32 retval = v;
2063  retval <<= SHIFT_LEFT;
2064  retval >>= (SHIFT_LEFT + SHIFT_RIGHT);
2065  return retval;
2066 }
2067 
2068 static int __kmp_cpuid_mask_width(int count) {
2069  int r = 0;
2070 
2071  while ((1 << r) < count)
2072  ++r;
2073  return r;
2074 }
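
// Worked examples for the two helpers above (illustration only):
//   __kmp_extract_bits<8, 15>(0xABCD) == 0xAB
//     SHIFT_LEFT = 32 - 1 - 15 = 16, SHIFT_RIGHT = 8, so the value is
//     shifted left 16 bits (0xABCD0000) and then right 24 bits (0xAB).
//   __kmp_cpuid_mask_width(6) == 3, since 1 << 3 == 8 is the smallest
//   power of two >= 6; __kmp_cpuid_mask_width(1) == 0.
#if 0
static void bit_helper_examples(void) {
  unsigned hi_byte = __kmp_extract_bits<8, 15>(0xABCDu); // 0xAB
  int width = __kmp_cpuid_mask_width(6);                 // 3
  (void)hi_byte;
  (void)width;
}
#endif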
2075 
2076 class apicThreadInfo {
2077 public:
2078  unsigned osId; // param to __kmp_affinity_bind_thread
2079  unsigned apicId; // from cpuid after binding
2080  unsigned maxCoresPerPkg; // ""
2081  unsigned maxThreadsPerPkg; // ""
2082  unsigned pkgId; // inferred from above values
2083  unsigned coreId; // ""
2084  unsigned threadId; // ""
2085 };
2086 
2087 static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
2088  const void *b) {
2089  const apicThreadInfo *aa = (const apicThreadInfo *)a;
2090  const apicThreadInfo *bb = (const apicThreadInfo *)b;
2091  if (aa->pkgId < bb->pkgId)
2092  return -1;
2093  if (aa->pkgId > bb->pkgId)
2094  return 1;
2095  if (aa->coreId < bb->coreId)
2096  return -1;
2097  if (aa->coreId > bb->coreId)
2098  return 1;
2099  if (aa->threadId < bb->threadId)
2100  return -1;
2101  if (aa->threadId > bb->threadId)
2102  return 1;
2103  return 0;
2104 }
2105 
2106 class kmp_cache_info_t {
2107 public:
2108  struct info_t {
2109  unsigned level, mask;
2110  };
2111  kmp_cache_info_t() : depth(0) { get_leaf4_levels(); }
2112  size_t get_depth() const { return depth; }
2113  info_t &operator[](size_t index) { return table[index]; }
2114  const info_t &operator[](size_t index) const { return table[index]; }
2115 
2116  static kmp_hw_t get_topology_type(unsigned level) {
2117  KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL);
2118  switch (level) {
2119  case 1:
2120  return KMP_HW_L1;
2121  case 2:
2122  return KMP_HW_L2;
2123  case 3:
2124  return KMP_HW_L3;
2125  }
2126  return KMP_HW_UNKNOWN;
2127  }
2128 
2129 private:
2130  static const int MAX_CACHE_LEVEL = 3;
2131 
2132  size_t depth;
2133  info_t table[MAX_CACHE_LEVEL];
2134 
2135  void get_leaf4_levels() {
2136  unsigned level = 0;
2137  while (depth < MAX_CACHE_LEVEL) {
2138  unsigned cache_type, max_threads_sharing;
2139  unsigned cache_level, cache_mask_width;
2140  kmp_cpuid buf2;
2141  __kmp_x86_cpuid(4, level, &buf2);
2142  cache_type = __kmp_extract_bits<0, 4>(buf2.eax);
2143  if (!cache_type)
2144  break;
2145  // Skip instruction caches
2146  if (cache_type == 2) {
2147  level++;
2148  continue;
2149  }
2150  max_threads_sharing = __kmp_extract_bits<14, 25>(buf2.eax) + 1;
2151  cache_mask_width = __kmp_cpuid_mask_width(max_threads_sharing);
2152  cache_level = __kmp_extract_bits<5, 7>(buf2.eax);
2153  table[depth].level = cache_level;
2154  table[depth].mask = ((-1) << cache_mask_width);
2155  depth++;
2156  level++;
2157  }
2158  }
2159 };
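
// Worked example for the leaf-4 decoding above, with hypothetical values:
// suppose CPUID(4) reports a data cache shared by up to 4 logical processors.
// Then cache_mask_width == __kmp_cpuid_mask_width(4) == 2 and the stored mask
// is (-1) << 2. Two hardware threads share that cache exactly when their
// APIC ids agree in all bits above the low 2 bits.
#if 0
static bool share_same_cache(unsigned apic_a, unsigned apic_b) {
  unsigned mask = (unsigned)(-1) << 2; // from cache_mask_width == 2
  return (apic_a & mask) == (apic_b & mask);
}
#endif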
2160 
2161 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
2162 // an algorithm which cycles through the available os threads, setting
2163 // the current thread's affinity mask to that thread, and then retrieves
2164 // the Apic Id for each thread context using the cpuid instruction.
2165 static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
2166  kmp_cpuid buf;
2167  *msg_id = kmp_i18n_null;
2168 
2169  if (__kmp_affinity.flags.verbose) {
2170  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
2171  }
2172 
2173  // Check if cpuid leaf 4 is supported.
2174  __kmp_x86_cpuid(0, 0, &buf);
2175  if (buf.eax < 4) {
2176  *msg_id = kmp_i18n_str_NoLeaf4Support;
2177  return false;
2178  }
2179 
2180  // The algorithm used starts by setting the affinity to each available thread
2181  // and retrieving info from the cpuid instruction, so if we are not capable of
2182  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then we
2183  // need to do something else - use the defaults that we calculated from
2184  // issuing cpuid without binding to each proc.
2185  if (!KMP_AFFINITY_CAPABLE()) {
2186  // Hack to try and infer the machine topology using only the data
2187  // available from cpuid on the current thread, and __kmp_xproc.
2188  KMP_ASSERT(__kmp_affinity.type == affinity_none);
2189 
2190  // Get an upper bound on the number of threads per package using cpuid(1).
2191  // On some OS/chip combinations where HT is supported by the chip but is
2192  // disabled, this value will be 2 on a single core chip. Usually, it will be
2193  // 2 if HT is enabled and 1 if HT is disabled.
2194  __kmp_x86_cpuid(1, 0, &buf);
2195  int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
2196  if (maxThreadsPerPkg == 0) {
2197  maxThreadsPerPkg = 1;
2198  }
2199 
2200  // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
2201  // value.
2202  //
2203  // The author of cpu_count.cpp treated this only as an upper bound on the
2204  // number of cores, but I haven't seen any cases where it was greater than
2205  // the actual number of cores, so we will treat it as exact in this block of
2206  // code.
2207  //
2208  // First, we need to check if cpuid(4) is supported on this chip. To see if
2209  // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
2210  // greater.
2211  __kmp_x86_cpuid(0, 0, &buf);
2212  if (buf.eax >= 4) {
2213  __kmp_x86_cpuid(4, 0, &buf);
2214  nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
2215  } else {
2216  nCoresPerPkg = 1;
2217  }
2218 
2219  // There is no way to reliably tell if HT is enabled without issuing the
2220  // cpuid instruction from every thread, and correlating the cpuid info, so
2221  // if the machine is not affinity capable, we assume that HT is off. We have
2222  // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
2223  // does not support HT.
2224  //
2225  // - Older OSes are usually found on machines with older chips, which do not
2226  // support HT.
2227  // - The performance penalty for mistakenly identifying a machine as HT when
2228  // it isn't (which results in blocktime being incorrectly set to 0) is
2229  // greater than the penalty for mistakenly identifying a machine as
2230  // being 1 thread/core when it is really HT enabled (which results in
2231  // blocktime being incorrectly set to a positive value).
2232  __kmp_ncores = __kmp_xproc;
2233  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
2234  __kmp_nThreadsPerCore = 1;
2235  return true;
2236  }
2237 
2238  // From here on, we can assume that it is safe to call
2239  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
2240  // __kmp_affinity.type = affinity_none.
2241 
2242  // Save the affinity mask for the current thread.
2243  kmp_affinity_raii_t previous_affinity;
2244 
2245  // Run through each of the available contexts, binding the current thread
2246  // to it, and obtaining the pertinent information using the cpuid instr.
2247  //
2248  // The relevant information is:
2249  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
2250  // has a unique Apic Id, which is of the form pkg# : core# : thread#.
2251  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
2252  // of this field determines the width of the core# + thread# fields in the
2253  // Apic Id. It is also an upper bound on the number of threads per
2254  // package, but it has been verified that situations happen where it is not
2255  // exact. In particular, on certain OS/chip combinations where Intel(R)
2256  // Hyper-Threading Technology is supported by the chip but has been
2257  // disabled, the value of this field will be 2 (for a single core chip).
2258  // On other OS/chip combinations supporting Intel(R) Hyper-Threading
2259  // Technology, the value of this field will be 1 when Intel(R)
2260  // Hyper-Threading Technology is disabled and 2 when it is enabled.
2261  // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
2262  // of this field (+1) determines the width of the core# field in the Apic
2263  // Id. The comments in "cpucount.cpp" say that this value is an upper
2264  // bound, but the IA-32 architecture manual says that it is exactly the
2265  // number of cores per package, and I haven't seen any case where it
2266  // wasn't.
2267  //
2268  // From this information, deduce the package Id, core Id, and thread Id,
2269  // and set the corresponding fields in the apicThreadInfo struct.
2270  unsigned i;
2271  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
2272  __kmp_avail_proc * sizeof(apicThreadInfo));
2273  unsigned nApics = 0;
2274  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
2275  // Skip this proc if it is not included in the machine model.
2276  if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
2277  continue;
2278  }
2279  KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
2280 
2281  __kmp_affinity_dispatch->bind_thread(i);
2282  threadInfo[nApics].osId = i;
2283 
2284  // The apic id and max threads per pkg come from cpuid(1).
2285  __kmp_x86_cpuid(1, 0, &buf);
2286  if (((buf.edx >> 9) & 1) == 0) {
2287  __kmp_free(threadInfo);
2288  *msg_id = kmp_i18n_str_ApicNotPresent;
2289  return false;
2290  }
2291  threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
2292  threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
2293  if (threadInfo[nApics].maxThreadsPerPkg == 0) {
2294  threadInfo[nApics].maxThreadsPerPkg = 1;
2295  }
2296 
2297  // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
2298  // value.
2299  //
2300  // First, we need to check if cpuid(4) is supported on this chip. To see if
2301  // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
2302  // or greater.
2303  __kmp_x86_cpuid(0, 0, &buf);
2304  if (buf.eax >= 4) {
2305  __kmp_x86_cpuid(4, 0, &buf);
2306  threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
2307  } else {
2308  threadInfo[nApics].maxCoresPerPkg = 1;
2309  }
2310 
2311  // Infer the pkgId / coreId / threadId using only the info obtained locally.
2312  int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
2313  threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
2314 
2315  int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
2316  int widthT = widthCT - widthC;
2317  if (widthT < 0) {
2318  // I've never seen this one happen, but I suppose it could, if the cpuid
2319  // instruction on a chip was really screwed up. Make sure to restore the
2320  // affinity mask before the tail call.
2321  __kmp_free(threadInfo);
2322  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
2323  return false;
2324  }
2325 
2326  int maskC = (1 << widthC) - 1;
2327  threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;
2328 
2329  int maskT = (1 << widthT) - 1;
2330  threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;
2331 
2332  nApics++;
2333  }
2334 
2335  // We've collected all the info we need.
2336  // Restore the old affinity mask for this thread.
2337  previous_affinity.restore();
2338 
2339  // Sort the threadInfo table by physical Id.
2340  qsort(threadInfo, nApics, sizeof(*threadInfo),
2341  __kmp_affinity_cmp_apicThreadInfo_phys_id);
2342 
2343  // The table is now sorted by pkgId / coreId / threadId, but we really don't
2344  // know the radix of any of the fields. pkgId's may be sparsely assigned among
2345  // the chips on a system. Although coreId's are usually assigned
2346  // [0 .. coresPerPkg-1] and threadId's are usually assigned
2347  // [0..threadsPerCore-1], we don't want to make any such assumptions.
2348  //
2349  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
2350  // total # packages) are at this point - we want to determine that now. We
2351  // only have an upper bound on the first two figures.
2352  //
2353  // We also perform a consistency check at this point: the values returned by
2354  // the cpuid instruction for any thread bound to a given package had better
2355  // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
2356  nPackages = 1;
2357  nCoresPerPkg = 1;
2358  __kmp_nThreadsPerCore = 1;
2359  unsigned nCores = 1;
2360 
2361  unsigned pkgCt = 1; // to determine radices
2362  unsigned lastPkgId = threadInfo[0].pkgId;
2363  unsigned coreCt = 1;
2364  unsigned lastCoreId = threadInfo[0].coreId;
2365  unsigned threadCt = 1;
2366  unsigned lastThreadId = threadInfo[0].threadId;
2367 
2368  // intra-pkg consistency checks
2369  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
2370  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
2371 
2372  for (i = 1; i < nApics; i++) {
2373  if (threadInfo[i].pkgId != lastPkgId) {
2374  nCores++;
2375  pkgCt++;
2376  lastPkgId = threadInfo[i].pkgId;
2377  if ((int)coreCt > nCoresPerPkg)
2378  nCoresPerPkg = coreCt;
2379  coreCt = 1;
2380  lastCoreId = threadInfo[i].coreId;
2381  if ((int)threadCt > __kmp_nThreadsPerCore)
2382  __kmp_nThreadsPerCore = threadCt;
2383  threadCt = 1;
2384  lastThreadId = threadInfo[i].threadId;
2385 
2386  // This is a different package, so go on to the next iteration without
2387  // doing any consistency checks. Reset the consistency check vars, though.
2388  prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
2389  prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
2390  continue;
2391  }
2392 
2393  if (threadInfo[i].coreId != lastCoreId) {
2394  nCores++;
2395  coreCt++;
2396  lastCoreId = threadInfo[i].coreId;
2397  if ((int)threadCt > __kmp_nThreadsPerCore)
2398  __kmp_nThreadsPerCore = threadCt;
2399  threadCt = 1;
2400  lastThreadId = threadInfo[i].threadId;
2401  } else if (threadInfo[i].threadId != lastThreadId) {
2402  threadCt++;
2403  lastThreadId = threadInfo[i].threadId;
2404  } else {
2405  __kmp_free(threadInfo);
2406  *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
2407  return false;
2408  }
2409 
2410  // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
2411  // fields agree between all the threads bound to a given package.
2412  if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
2413  (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
2414  __kmp_free(threadInfo);
2415  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
2416  return false;
2417  }
2418  }
2419  // When affinity is off, this routine will still be called to set
2420  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
2421  // Make sure all these vars are set correctly
2422  nPackages = pkgCt;
2423  if ((int)coreCt > nCoresPerPkg)
2424  nCoresPerPkg = coreCt;
2425  if ((int)threadCt > __kmp_nThreadsPerCore)
2426  __kmp_nThreadsPerCore = threadCt;
2427  __kmp_ncores = nCores;
2428  KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);
2429 
2430  // Now that we've determined the number of packages, the number of cores per
2431  // package, and the number of threads per core, we can construct the data
2432  // structure that is to be returned.
2433  int idx = 0;
2434  int pkgLevel = 0;
2435  int coreLevel = 1;
2436  int threadLevel = 2;
2437  //(__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
2438  int depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
2439  kmp_hw_t types[3];
2440  if (pkgLevel >= 0)
2441  types[idx++] = KMP_HW_SOCKET;
2442  if (coreLevel >= 0)
2443  types[idx++] = KMP_HW_CORE;
2444  if (threadLevel >= 0)
2445  types[idx++] = KMP_HW_THREAD;
2446 
2447  KMP_ASSERT(depth > 0);
2448  __kmp_topology = kmp_topology_t::allocate(nApics, depth, types);
2449 
2450  for (i = 0; i < nApics; ++i) {
2451  idx = 0;
2452  unsigned os = threadInfo[i].osId;
2453  kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
2454  hw_thread.clear();
2455 
2456  if (pkgLevel >= 0) {
2457  hw_thread.ids[idx++] = threadInfo[i].pkgId;
2458  }
2459  if (coreLevel >= 0) {
2460  hw_thread.ids[idx++] = threadInfo[i].coreId;
2461  }
2462  if (threadLevel >= 0) {
2463  hw_thread.ids[idx++] = threadInfo[i].threadId;
2464  }
2465  hw_thread.os_id = os;
2466  }
2467 
2468  __kmp_free(threadInfo);
2469  __kmp_topology->sort_ids();
2470  if (!__kmp_topology->check_ids()) {
2471  kmp_topology_t::deallocate(__kmp_topology);
2472  __kmp_topology = nullptr;
2473  *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
2474  return false;
2475  }
2476  return true;
2477 }
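
// Worked example of the legacy APIC id decomposition above, with hypothetical
// CPUID values: maxThreadsPerPkg = 8 and maxCoresPerPkg = 4 give widthCT = 3,
// widthC = 2 and widthT = 1, so apicId = 22 (0b10110) splits into
// pkgId = 2, coreId = 3, threadId = 0.
#if 0
static void legacy_apic_decode_example(void) {
  unsigned apicId = 22;
  int widthCT = __kmp_cpuid_mask_width(8); // 3
  int widthC = __kmp_cpuid_mask_width(4);  // 2
  int widthT = widthCT - widthC;           // 1
  unsigned pkgId = apicId >> widthCT;                         // 2
  unsigned coreId = (apicId >> widthT) & ((1 << widthC) - 1); // 3
  unsigned threadId = apicId & ((1 << widthT) - 1);           // 0
  (void)pkgId;
  (void)coreId;
  (void)threadId;
}
#endif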
2478 
2479 // Hybrid cpu detection using CPUID.1A
2480 // Thread should be pinned to processor already
2481 static void __kmp_get_hybrid_info(kmp_hw_core_type_t *type, int *efficiency,
2482  unsigned *native_model_id) {
2483  kmp_cpuid buf;
2484  __kmp_x86_cpuid(0x1a, 0, &buf);
2485  *type = (kmp_hw_core_type_t)__kmp_extract_bits<24, 31>(buf.eax);
2486  switch (*type) {
2487  case KMP_HW_CORE_TYPE_ATOM:
2488  *efficiency = 0;
2489  break;
2490  case KMP_HW_CORE_TYPE_CORE:
2491  *efficiency = 1;
2492  break;
2493  default:
2494  *efficiency = 0;
2495  }
2496  *native_model_id = __kmp_extract_bits<0, 23>(buf.eax);
2497 }
2498 
2499 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
2500 // architectures support a newer interface for specifying the x2APIC Ids,
2501 // based on CPUID.B or CPUID.1F
2502 /*
2503  * CPUID.B or 1F, Input ECX (sub leaf # aka level number)
2504  Bits Bits Bits Bits
2505  31-16 15-8 7-5 4-0
2506 ---+-----------+--------------+-------------+-----------------+
2507 EAX| reserved | reserved | reserved | Bits to Shift |
2508 ---+-----------|--------------+-------------+-----------------|
2509 EBX| reserved | Num logical processors at level (16 bits) |
2510 ---+-----------|--------------+-------------------------------|
2511 ECX| reserved | Level Type | Level Number (8 bits) |
2512 ---+-----------+--------------+-------------------------------|
2513 EDX| X2APIC ID (32 bits) |
2514 ---+----------------------------------------------------------+
2515 */
2516 
2517 enum {
2518  INTEL_LEVEL_TYPE_INVALID = 0, // Package level
2519  INTEL_LEVEL_TYPE_SMT = 1,
2520  INTEL_LEVEL_TYPE_CORE = 2,
2521  INTEL_LEVEL_TYPE_MODULE = 3,
2522  INTEL_LEVEL_TYPE_TILE = 4,
2523  INTEL_LEVEL_TYPE_DIE = 5,
2524  INTEL_LEVEL_TYPE_LAST = 6,
2525 };
2526 
2527 struct cpuid_level_info_t {
2528  unsigned level_type, mask, mask_width, nitems, cache_mask;
2529 };
2530 
2531 static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) {
2532  switch (intel_type) {
2533  case INTEL_LEVEL_TYPE_INVALID:
2534  return KMP_HW_SOCKET;
2535  case INTEL_LEVEL_TYPE_SMT:
2536  return KMP_HW_THREAD;
2537  case INTEL_LEVEL_TYPE_CORE:
2538  return KMP_HW_CORE;
2539  case INTEL_LEVEL_TYPE_TILE:
2540  return KMP_HW_TILE;
2541  case INTEL_LEVEL_TYPE_MODULE:
2542  return KMP_HW_MODULE;
2543  case INTEL_LEVEL_TYPE_DIE:
2544  return KMP_HW_DIE;
2545  }
2546  return KMP_HW_UNKNOWN;
2547 }
2548 
2549 // This function takes the topology leaf, a levels array to store the levels
2550 // detected and a bitmap of the known levels.
2551 // Returns the number of levels in the topology
2552 static unsigned
2553 __kmp_x2apicid_get_levels(int leaf,
2554  cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST],
2555  kmp_uint64 known_levels) {
2556  unsigned level, levels_index;
2557  unsigned level_type, mask_width, nitems;
2558  kmp_cpuid buf;
2559 
2560  // The algorithm lets each known topology layer absorb the characteristics
2561  // of any unknown topology layers directly above it.
2562  // e.g., Suppose layers were SMT <X> CORE <Y> <Z> PACKAGE, where <X> <Y> <Z>
2563  // are unknown topology layers. Then SMT will take the characteristics of
2564  // (SMT x <X>) and CORE will take the characteristics of (CORE x <Y> x <Z>).
2565  // This eliminates unknown portions of the topology while still keeping the
2566  // correct structure.
2567  level = levels_index = 0;
2568  do {
2569  __kmp_x86_cpuid(leaf, level, &buf);
2570  level_type = __kmp_extract_bits<8, 15>(buf.ecx);
2571  mask_width = __kmp_extract_bits<0, 4>(buf.eax);
2572  nitems = __kmp_extract_bits<0, 15>(buf.ebx);
2573  if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0)
2574  return 0;
2575 
2576  if (known_levels & (1ull << level_type)) {
2577  // Add a new level to the topology
2578  KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST);
2579  levels[levels_index].level_type = level_type;
2580  levels[levels_index].mask_width = mask_width;
2581  levels[levels_index].nitems = nitems;
2582  levels_index++;
2583  } else {
2584  // If it is an unknown level, then logically move the previous layer up
2585  if (levels_index > 0) {
2586  levels[levels_index - 1].mask_width = mask_width;
2587  levels[levels_index - 1].nitems = nitems;
2588  }
2589  }
2590  level++;
2591  } while (level_type != INTEL_LEVEL_TYPE_INVALID);
2592 
2593  // Ensure the INTEL_LEVEL_TYPE_INVALID (Socket) layer isn't first
2594  if (levels_index == 0 || levels[0].level_type == INTEL_LEVEL_TYPE_INVALID)
2595  return 0;
2596 
2597  // Set the masks to & with apicid
2598  for (unsigned i = 0; i < levels_index; ++i) {
2599  if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) {
2600  levels[i].mask = ~((-1) << levels[i].mask_width);
2601  levels[i].cache_mask = (-1) << levels[i].mask_width;
2602  for (unsigned j = 0; j < i; ++j)
2603  levels[i].mask ^= levels[j].mask;
2604  } else {
2605  KMP_DEBUG_ASSERT(i > 0);
2606  levels[i].mask = (-1) << levels[i - 1].mask_width;
2607  levels[i].cache_mask = 0;
2608  }
2609  }
2610  return levels_index;
2611 }
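
// Worked example of the mask construction above, with hypothetical CPUID.1F
// data: an SMT level with mask_width = 1 and a CORE level with mask_width = 6.
//   SMT mask    = ~((-1) << 1)        = 0x01
//   CORE mask   = ~((-1) << 6) ^ 0x01 = 0x3E
//   SOCKET mask = (-1) << 6           (from the previous level's mask_width)
// An x2APIC id of 0x53 then yields thread 1, core 9 and package 1 after the
// per-level shifts applied later in __kmp_affinity_create_x2apicid_map.
#if 0
static void x2apic_decode_example(void) {
  unsigned apic = 0x53;
  unsigned smt_mask = ~((unsigned)(-1) << 1);             // 0x01
  unsigned core_mask = ~((unsigned)(-1) << 6) ^ smt_mask; // 0x3E
  unsigned pkg_mask = (unsigned)(-1) << 6;                // 0xFFFFFFC0
  unsigned thread_id = apic & smt_mask;                   // 1
  unsigned core_id = (apic & core_mask) >> 1;             // 9
  unsigned pkg_id = (apic & pkg_mask) >> 6;               // 1
  (void)thread_id;
  (void)core_id;
  (void)pkg_id;
}
#endif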
2612 
2613 static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {
2614 
2615  cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST];
2616  kmp_hw_t types[INTEL_LEVEL_TYPE_LAST];
2617  unsigned levels_index;
2618  kmp_cpuid buf;
2619  kmp_uint64 known_levels;
2620  int topology_leaf, highest_leaf, apic_id;
2621  int num_leaves;
2622  static int leaves[] = {0, 0};
2623 
2624  kmp_i18n_id_t leaf_message_id;
2625 
2626  KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST);
2627 
2628  *msg_id = kmp_i18n_null;
2629  if (__kmp_affinity.flags.verbose) {
2630  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
2631  }
2632 
2633  // Figure out the known topology levels
2634  known_levels = 0ull;
2635  for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) {
2636  if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) {
2637  known_levels |= (1ull << i);
2638  }
2639  }
2640 
2641  // Get the highest cpuid leaf supported
2642  __kmp_x86_cpuid(0, 0, &buf);
2643  highest_leaf = buf.eax;
2644 
2645  // If a specific topology method was requested, only allow that specific leaf
2646  // otherwise, try both leaves 31 and 11 in that order
2647  num_leaves = 0;
2648  if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
2649  num_leaves = 1;
2650  leaves[0] = 11;
2651  leaf_message_id = kmp_i18n_str_NoLeaf11Support;
2652  } else if (__kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
2653  num_leaves = 1;
2654  leaves[0] = 31;
2655  leaf_message_id = kmp_i18n_str_NoLeaf31Support;
2656  } else {
2657  num_leaves = 2;
2658  leaves[0] = 31;
2659  leaves[1] = 11;
2660  leaf_message_id = kmp_i18n_str_NoLeaf11Support;
2661  }
2662 
2663  // Check to see if cpuid leaf 31 or 11 is supported.
2664  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2665  topology_leaf = -1;
2666  for (int i = 0; i < num_leaves; ++i) {
2667  int leaf = leaves[i];
2668  if (highest_leaf < leaf)
2669  continue;
2670  __kmp_x86_cpuid(leaf, 0, &buf);
2671  if (buf.ebx == 0)
2672  continue;
2673  topology_leaf = leaf;
2674  levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels);
2675  if (levels_index == 0)
2676  continue;
2677  break;
2678  }
2679  if (topology_leaf == -1 || levels_index == 0) {
2680  *msg_id = leaf_message_id;
2681  return false;
2682  }
2683  KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST);
2684 
2685  // The algorithm used starts by setting the affinity to each available thread
2686  // and retrieving info from the cpuid instruction, so if we are not capable of
2687  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
2688  // we need to do something else - use the defaults that we calculated from
2689  // issuing cpuid without binding to each proc.
2690  if (!KMP_AFFINITY_CAPABLE()) {
2691  // Hack to try and infer the machine topology using only the data
2692  // available from cpuid on the current thread, and __kmp_xproc.
2693  KMP_ASSERT(__kmp_affinity.type == affinity_none);
2694  for (unsigned i = 0; i < levels_index; ++i) {
2695  if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) {
2696  __kmp_nThreadsPerCore = levels[i].nitems;
2697  } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) {
2698  nCoresPerPkg = levels[i].nitems;
2699  }
2700  }
2701  __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
2702  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
2703  return true;
2704  }
2705 
2706  // Allocate the data structure to be returned.
2707  int depth = levels_index;
2708  for (int i = depth - 1, j = 0; i >= 0; --i, ++j)
2709  types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type);
2710  __kmp_topology =
2711  kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types);
2712 
2713  // Insert equivalent cache types if they exist
2714  kmp_cache_info_t cache_info;
2715  for (size_t i = 0; i < cache_info.get_depth(); ++i) {
2716  const kmp_cache_info_t::info_t &info = cache_info[i];
2717  unsigned cache_mask = info.mask;
2718  unsigned cache_level = info.level;
2719  for (unsigned j = 0; j < levels_index; ++j) {
2720  unsigned hw_cache_mask = levels[j].cache_mask;
2721  kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level);
2722  if (hw_cache_mask == cache_mask && j < levels_index - 1) {
2723  kmp_hw_t type =
2724  __kmp_intel_type_2_topology_type(levels[j + 1].level_type);
2725  __kmp_topology->set_equivalent_type(cache_type, type);
2726  }
2727  }
2728  }
2729 
2730  // From here on, we can assume that it is safe to call
2731  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
2732  // __kmp_affinity.type = affinity_none.
2733 
2734  // Save the affinity mask for the current thread.
2735  kmp_affinity_raii_t previous_affinity;
2736 
2737  // Run through each of the available contexts, binding the current thread
2738  // to it, and obtaining the pertinent information using the cpuid instr.
2739  unsigned int proc;
2740  int hw_thread_index = 0;
2741  KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
2742  cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST];
2743  unsigned my_levels_index;
2744 
2745  // Skip this proc if it is not included in the machine model.
2746  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
2747  continue;
2748  }
2749  KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc);
2750 
2751  __kmp_affinity_dispatch->bind_thread(proc);
2752 
2753  // New algorithm
2754  __kmp_x86_cpuid(topology_leaf, 0, &buf);
2755  apic_id = buf.edx;
2756  kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
2757  my_levels_index =
2758  __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels);
2759  if (my_levels_index == 0 || my_levels_index != levels_index) {
2760  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
2761  return false;
2762  }
2763  hw_thread.clear();
2764  hw_thread.os_id = proc;
2765  // Put in topology information
2766  for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) {
2767  hw_thread.ids[idx] = apic_id & my_levels[j].mask;
2768  if (j > 0) {
2769  hw_thread.ids[idx] >>= my_levels[j - 1].mask_width;
2770  }
2771  }
2772  // Hybrid information
2773  if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) {
2774  kmp_hw_core_type_t type;
2775  unsigned native_model_id;
2776  int efficiency;
2777  __kmp_get_hybrid_info(&type, &efficiency, &native_model_id);
2778  hw_thread.attrs.set_core_type(type);
2779  hw_thread.attrs.set_core_eff(efficiency);
2780  }
2781  hw_thread_index++;
2782  }
2783  KMP_ASSERT(hw_thread_index > 0);
2784  __kmp_topology->sort_ids();
2785  if (!__kmp_topology->check_ids()) {
2786  kmp_topology_t::deallocate(__kmp_topology);
2787  __kmp_topology = nullptr;
2788  *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
2789  return false;
2790  }
2791  return true;
2792 }
2793 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
2794 
2795 #define osIdIndex 0
2796 #define threadIdIndex 1
2797 #define coreIdIndex 2
2798 #define pkgIdIndex 3
2799 #define nodeIdIndex 4
2800 
2801 typedef unsigned *ProcCpuInfo;
2802 static unsigned maxIndex = pkgIdIndex;
2803 
2804 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
2805  const void *b) {
2806  unsigned i;
2807  const unsigned *aa = *(unsigned *const *)a;
2808  const unsigned *bb = *(unsigned *const *)b;
2809  for (i = maxIndex;; i--) {
2810  if (aa[i] < bb[i])
2811  return -1;
2812  if (aa[i] > bb[i])
2813  return 1;
2814  if (i == osIdIndex)
2815  break;
2816  }
2817  return 0;
2818 }
2819 
2820 #if KMP_USE_HIER_SCHED
2821 // Set the array sizes for the hierarchy layers
2822 static void __kmp_dispatch_set_hierarchy_values() {
2823  // Set the maximum number of L1's to number of cores
2824  // Set the maximum number of L2's to either number of cores / 2 for
2825  // Intel(R) Xeon Phi(TM) coprocessor formerly codenamed Knights Landing
2826  // Or the number of cores for Intel(R) Xeon(R) processors
2827  // Set the maximum number of NUMA nodes and L3's to number of packages
2828  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
2829  nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
2830  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
2831 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
2832  KMP_MIC_SUPPORTED
2833  if (__kmp_mic_type >= mic3)
2834  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2;
2835  else
2836 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
2837  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores;
2838  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages;
2839  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages;
2840  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1;
2841  // Set the number of threads per unit
2842  // Number of hardware threads per L1/L2/L3/NUMA/LOOP
2843  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
2844  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
2845  __kmp_nThreadsPerCore;
2846 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
2847  KMP_MIC_SUPPORTED
2848  if (__kmp_mic_type >= mic3)
2849  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
2850  2 * __kmp_nThreadsPerCore;
2851  else
2852 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
2853  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
2854  __kmp_nThreadsPerCore;
2855  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] =
2856  nCoresPerPkg * __kmp_nThreadsPerCore;
2857  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] =
2858  nCoresPerPkg * __kmp_nThreadsPerCore;
2859  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] =
2860  nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
2861 }
2862 
2863 // Return the index into the hierarchy for this tid and layer type (L1, L2, etc)
2864 // i.e., this thread's L1 or this thread's L2, etc.
2865 int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) {
2866  int index = type + 1;
2867  int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
2868  KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST);
2869  if (type == kmp_hier_layer_e::LAYER_THREAD)
2870  return tid;
2871  else if (type == kmp_hier_layer_e::LAYER_LOOP)
2872  return 0;
2873  KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0);
2874  if (tid >= num_hw_threads)
2875  tid = tid % num_hw_threads;
2876  return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index];
2877 }
2878 
2879 // Return the number of t1's per t2
2880 int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) {
2881  int i1 = t1 + 1;
2882  int i2 = t2 + 1;
2883  KMP_DEBUG_ASSERT(i1 <= i2);
2884  KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST);
2885  KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST);
2886  KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0);
2887  // (nthreads/t2) / (nthreads/t1) = t1 / t2
2888  return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1];
2889 }
2890 #endif // KMP_USE_HIER_SCHED
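
// Worked example of the dispatch-hierarchy arithmetic above (illustration
// only), for a hypothetical machine with 2 packages x 8 cores x 2 hardware
// threads: an L1 (core) unit holds 2 threads and there are 16 of them, while
// an L3/package unit holds 16 threads and there are 2 of them, so tid = 21
// maps to L1 index (21 / 2) % 16 == 10 and L3 index (21 / 16) % 2 == 1.
#if 0
static int hier_index(int tid, int threads_per_unit, int num_units) {
  return (tid / threads_per_unit) % num_units;
}
#endif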
2891 
2892 static inline const char *__kmp_cpuinfo_get_filename() {
2893  const char *filename;
2894  if (__kmp_cpuinfo_file != nullptr)
2895  filename = __kmp_cpuinfo_file;
2896  else
2897  filename = "/proc/cpuinfo";
2898  return filename;
2899 }
2900 
2901 static inline const char *__kmp_cpuinfo_get_envvar() {
2902  const char *envvar = nullptr;
2903  if (__kmp_cpuinfo_file != nullptr)
2904  envvar = "KMP_CPUINFO_FILE";
2905  return envvar;
2906 }
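
// Illustrative sketch (not part of the runtime): a typical x86 /proc/cpuinfo
// record, as the parser below expects it, looks like
//
//   processor   : 3
//   physical id : 0
//   core id     : 1
//   <more "key : value" lines, then a blank line ending the record>
//
// and each recognized field is picked apart with the same strncmp / strchr /
// sscanf sequence used below. The helper name here is hypothetical.
#if 0
#include <cstdio>
#include <cstring>

static bool parse_field(const char *line, const char *key, unsigned *val) {
  if (std::strncmp(line, key, std::strlen(key)) != 0)
    return false;
  const char *p = std::strchr(line + std::strlen(key), ':');
  return p != nullptr && std::sscanf(p + 1, "%u", val) == 1;
}
// e.g. parse_field("physical id\t: 0", "physical id", &v) sets v to 0.
#endif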
2907 
2908 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
2909 // affinity map. On AIX, the map is obtained through system SRAD (Scheduler
2910 // Resource Allocation Domain).
2911 static bool __kmp_affinity_create_cpuinfo_map(int *line,
2912  kmp_i18n_id_t *const msg_id) {
2913  *msg_id = kmp_i18n_null;
2914 
2915 #if KMP_OS_AIX
2916  unsigned num_records = __kmp_xproc;
2917 #else
2918  const char *filename = __kmp_cpuinfo_get_filename();
2919  const char *envvar = __kmp_cpuinfo_get_envvar();
2920 
2921  if (__kmp_affinity.flags.verbose) {
2922  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
2923  }
2924 
2925  kmp_safe_raii_file_t f(filename, "r", envvar);
2926 
2927  // Scan the file and count the number of "processor" (osId) fields,
2928  // and find the highest value of <n> for a node_<n> field.
2929  char buf[256];
2930  unsigned num_records = 0;
2931  while (!feof(f)) {
2932  buf[sizeof(buf) - 1] = 1;
2933  if (!fgets(buf, sizeof(buf), f)) {
2934  // Read errors presumably because of EOF
2935  break;
2936  }
2937 
2938  char s1[] = "processor";
2939  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2940  num_records++;
2941  continue;
2942  }
2943 
2944  // FIXME - this will match "node_<n> <garbage>"
2945  unsigned level;
2946  if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
2947  // validate the input first:
2948  if (level > (unsigned)__kmp_xproc) { // level is too big
2949  level = __kmp_xproc;
2950  }
2951  if (nodeIdIndex + level >= maxIndex) {
2952  maxIndex = nodeIdIndex + level;
2953  }
2954  continue;
2955  }
2956  }
2957 
2958  // Check for empty file / no valid processor records, or too many. The number
2959  // of records can't exceed the number of valid bits in the affinity mask.
2960  if (num_records == 0) {
2961  *msg_id = kmp_i18n_str_NoProcRecords;
2962  return false;
2963  }
2964  if (num_records > (unsigned)__kmp_xproc) {
2965  *msg_id = kmp_i18n_str_TooManyProcRecords;
2966  return false;
2967  }
2968 
2969  // Set the file pointer back to the beginning, so that we can scan the file
2970  // again, this time performing a full parse of the data. Allocate a vector of
2971  // ProcCpuInfo object, where we will place the data. Adding an extra element
2972  // ProcCpuInfo objects, where we will place the data. Adding an extra element
2973  // conditions.
2974  if (fseek(f, 0, SEEK_SET) != 0) {
2975  *msg_id = kmp_i18n_str_CantRewindCpuinfo;
2976  return false;
2977  }
2978 #endif // KMP_OS_AIX
2979 
2980  // Allocate the array of records to store the proc info in. The dummy
2981  // element at the end makes the logic in filling them out easier to code.
2982  unsigned **threadInfo =
2983  (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *));
2984  unsigned i;
2985  for (i = 0; i <= num_records; i++) {
2986  threadInfo[i] =
2987  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2988  }
2989 
2990 #define CLEANUP_THREAD_INFO \
2991  for (i = 0; i <= num_records; i++) { \
2992  __kmp_free(threadInfo[i]); \
2993  } \
2994  __kmp_free(threadInfo);
2995 
2996  // A value of UINT_MAX means that we didn't find the field
2997  unsigned __index;
2998 
2999 #define INIT_PROC_INFO(p) \
3000  for (__index = 0; __index <= maxIndex; __index++) { \
3001  (p)[__index] = UINT_MAX; \
3002  }
3003 
3004  for (i = 0; i <= num_records; i++) {
3005  INIT_PROC_INFO(threadInfo[i]);
3006  }
3007 
3008 #if KMP_OS_AIX
3009  int smt_threads;
3010  lpar_info_format1_t cpuinfo;
3011  unsigned num_avail = __kmp_xproc;
3012 
3013  if (__kmp_affinity.flags.verbose)
3014  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "system info for topology");
3015 
3016  // Get the number of SMT threads per core.
3017  int retval =
3018  lpar_get_info(LPAR_INFO_FORMAT1, &cpuinfo, sizeof(lpar_info_format1_t));
3019  if (!retval)
3020  smt_threads = cpuinfo.smt_threads;
3021  else {
3022  CLEANUP_THREAD_INFO;
3023  *msg_id = kmp_i18n_str_UnknownTopology;
3024  return false;
3025  }
3026 
3027  // Allocate a resource set containing available system resources.
3028  rsethandle_t sys_rset = rs_alloc(RS_SYSTEM);
3029  if (sys_rset == NULL) {
3030  CLEANUP_THREAD_INFO;
3031  *msg_id = kmp_i18n_str_UnknownTopology;
3032  return false;
3033  }
3034  // Allocate a resource set for the SRAD info.
3035  rsethandle_t srad = rs_alloc(RS_EMPTY);
3036  if (srad == NULL) {
3037  rs_free(sys_rset);
3038  CLEANUP_THREAD_INFO;
3039  *msg_id = kmp_i18n_str_UnknownTopology;
3040  return false;
3041  }
3042 
3043  // Get the SRAD system detail level.
3044  int sradsdl = rs_getinfo(NULL, R_SRADSDL, 0);
3045  if (sradsdl < 0) {
3046  rs_free(sys_rset);
3047  rs_free(srad);
3048  CLEANUP_THREAD_INFO;
3049  *msg_id = kmp_i18n_str_UnknownTopology;
3050  return false;
3051  }
3052  // Get the number of RADs at that SRAD SDL.
3053  int num_rads = rs_numrads(sys_rset, sradsdl, 0);
3054  if (num_rads < 0) {
3055  rs_free(sys_rset);
3056  rs_free(srad);
3057  CLEANUP_THREAD_INFO;
3058  *msg_id = kmp_i18n_str_UnknownTopology;
3059  return false;
3060  }
3061 
3062  // Get the maximum number of procs that may be contained in a resource set.
3063  int max_procs = rs_getinfo(NULL, R_MAXPROCS, 0);
3064  if (max_procs < 0) {
3065  rs_free(sys_rset);
3066  rs_free(srad);
3067  CLEANUP_THREAD_INFO;
3068  *msg_id = kmp_i18n_str_UnknownTopology;
3069  return false;
3070  }
3071 
3072  int cur_rad = 0;
3073  int num_set = 0;
3074  for (int srad_idx = 0; cur_rad < num_rads && srad_idx < VMI_MAXRADS;
3075  ++srad_idx) {
3076  // Check if the SRAD is available in the RSET.
3077  if (rs_getrad(sys_rset, srad, sradsdl, srad_idx, 0) < 0)
3078  continue;
3079 
3080  for (int cpu = 0; cpu < max_procs; cpu++) {
3081  // Set the info for the cpu if it is in the SRAD.
3082  if (rs_op(RS_TESTRESOURCE, srad, NULL, R_PROCS, cpu)) {
3083  threadInfo[cpu][osIdIndex] = cpu;
3084  threadInfo[cpu][pkgIdIndex] = cur_rad;
3085  threadInfo[cpu][coreIdIndex] = cpu / smt_threads;
3086  ++num_set;
3087  if (num_set >= num_avail) {
3088  // Done if all available CPUs have been set.
3089  break;
3090  }
3091  }
3092  }
3093  ++cur_rad;
3094  }
3095  rs_free(sys_rset);
3096  rs_free(srad);
3097 
3098  // The topology is already sorted.
3099 
3100 #else // !KMP_OS_AIX
3101  unsigned num_avail = 0;
3102  *line = 0;
3103 #if KMP_ARCH_S390X
3104  bool reading_s390x_sys_info = true;
3105 #endif
3106  while (!feof(f)) {
3107  // Create an inner scoping level, so that all the goto targets at the end of
3108  // the loop appear in an outer scoping level. This avoids warnings about
3109  // jumping past an initialization to a target in the same block.
3110  {
3111  buf[sizeof(buf) - 1] = 1;
3112  bool long_line = false;
3113  if (!fgets(buf, sizeof(buf), f)) {
3114  // Read errors presumably because of EOF
3115  // If there is valid data in threadInfo[num_avail], then fake
3116  // a blank line to ensure that the last address gets parsed.
3117  bool valid = false;
3118  for (i = 0; i <= maxIndex; i++) {
3119  if (threadInfo[num_avail][i] != UINT_MAX) {
3120  valid = true;
3121  }
3122  }
3123  if (!valid) {
3124  break;
3125  }
3126  buf[0] = 0;
3127  } else if (!buf[sizeof(buf) - 1]) {
3128  // The line is longer than the buffer. Set a flag and don't
3129  // emit an error if we were going to ignore the line, anyway.
3130  long_line = true;
3131 
3132 #define CHECK_LINE \
3133  if (long_line) { \
3134  CLEANUP_THREAD_INFO; \
3135  *msg_id = kmp_i18n_str_LongLineCpuinfo; \
3136  return false; \
3137  }
3138  }
3139  (*line)++;
3140 
3141 #if KMP_ARCH_LOONGARCH64
3142  // The parsing logic of /proc/cpuinfo in this function highly depends on
3143  // the blank lines between each processor info block. But on LoongArch a
3144  // blank line exists before the first processor info block (i.e. after the
3145  // "system type" line). This blank line was added because the "system
3146  // type" line is unrelated to any of the CPUs. We must skip this line so
3147  // that the original logic works on LoongArch.
3148  if (*buf == '\n' && *line == 2)
3149  continue;
3150 #endif
3151 #if KMP_ARCH_S390X
3152  // s390x /proc/cpuinfo starts with a variable number of lines containing
3153  // the overall system information. Skip them.
3154  if (reading_s390x_sys_info) {
3155  if (*buf == '\n')
3156  reading_s390x_sys_info = false;
3157  continue;
3158  }
3159 #endif
3160 
3161 #if KMP_ARCH_S390X
3162  char s1[] = "cpu number";
3163 #else
3164  char s1[] = "processor";
3165 #endif
3166  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
3167  CHECK_LINE;
3168  char *p = strchr(buf + sizeof(s1) - 1, ':');
3169  unsigned val;
3170  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3171  goto no_val;
3172  if (threadInfo[num_avail][osIdIndex] != UINT_MAX)
3173 #if KMP_ARCH_AARCH64
3174  // Handle the old AArch64 /proc/cpuinfo layout differently,
3175  // it contains all of the 'processor' entries listed in a
3176  // single 'Processor' section, therefore the normal check
3177  // for duplicates in that section will always fail.
3178  num_avail++;
3179 #else
3180  goto dup_field;
3181 #endif
3182  threadInfo[num_avail][osIdIndex] = val;
3183 #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
3184  char path[256];
3185  KMP_SNPRINTF(
3186  path, sizeof(path),
3187  "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
3188  threadInfo[num_avail][osIdIndex]);
3189  __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
3190 
3191 #if KMP_ARCH_S390X
3192  // Disambiguate physical_package_id.
3193  unsigned book_id;
3194  KMP_SNPRINTF(path, sizeof(path),
3195  "/sys/devices/system/cpu/cpu%u/topology/book_id",
3196  threadInfo[num_avail][osIdIndex]);
3197  __kmp_read_from_file(path, "%u", &book_id);
3198  threadInfo[num_avail][pkgIdIndex] |= (book_id << 8);
3199 
3200  unsigned drawer_id;
3201  KMP_SNPRINTF(path, sizeof(path),
3202  "/sys/devices/system/cpu/cpu%u/topology/drawer_id",
3203  threadInfo[num_avail][osIdIndex]);
3204  __kmp_read_from_file(path, "%u", &drawer_id);
3205  threadInfo[num_avail][pkgIdIndex] |= (drawer_id << 16);
3206 #endif
3207 
3208  KMP_SNPRINTF(path, sizeof(path),
3209  "/sys/devices/system/cpu/cpu%u/topology/core_id",
3210  threadInfo[num_avail][osIdIndex]);
3211  __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
3212  continue;
3213 #else
3214  }
3215  char s2[] = "physical id";
3216  if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
3217  CHECK_LINE;
3218  char *p = strchr(buf + sizeof(s2) - 1, ':');
3219  unsigned val;
3220  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3221  goto no_val;
3222  if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX)
3223  goto dup_field;
3224  threadInfo[num_avail][pkgIdIndex] = val;
3225  continue;
3226  }
3227  char s3[] = "core id";
3228  if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
3229  CHECK_LINE;
3230  char *p = strchr(buf + sizeof(s3) - 1, ':');
3231  unsigned val;
3232  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3233  goto no_val;
3234  if (threadInfo[num_avail][coreIdIndex] != UINT_MAX)
3235  goto dup_field;
3236  threadInfo[num_avail][coreIdIndex] = val;
3237  continue;
3238 #endif // KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
3239  }
3240  char s4[] = "thread id";
3241  if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
3242  CHECK_LINE;
3243  char *p = strchr(buf + sizeof(s4) - 1, ':');
3244  unsigned val;
3245  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3246  goto no_val;
3247  if (threadInfo[num_avail][threadIdIndex] != UINT_MAX)
3248  goto dup_field;
3249  threadInfo[num_avail][threadIdIndex] = val;
3250  continue;
3251  }
3252  unsigned level;
3253  if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
3254  CHECK_LINE;
3255  char *p = strchr(buf + sizeof(s4) - 1, ':');
3256  unsigned val;
3257  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
3258  goto no_val;
3259  // validate the input before using level:
3260  if (level > (unsigned)__kmp_xproc) { // level is too big
3261  level = __kmp_xproc;
3262  }
3263  if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX)
3264  goto dup_field;
3265  threadInfo[num_avail][nodeIdIndex + level] = val;
3266  continue;
3267  }
3268 
3269  // We didn't recognize the leading token on the line. There are lots of
3270  // leading tokens that we don't recognize - if the line isn't empty, go on
3271  // to the next line.
3272  if ((*buf != 0) && (*buf != '\n')) {
3273  // If the line is longer than the buffer, read characters
3274  // until we find a newline.
3275  if (long_line) {
3276  int ch;
3277  while (((ch = fgetc(f)) != EOF) && (ch != '\n'))
3278  ;
3279  }
3280  continue;
3281  }
3282 
3283  // A newline has signalled the end of the processor record.
3284  // Check that there aren't too many procs specified.
3285  if ((int)num_avail == __kmp_xproc) {
3286  CLEANUP_THREAD_INFO;
3287  *msg_id = kmp_i18n_str_TooManyEntries;
3288  return false;
3289  }
3290 
3291  // Check for missing fields. The osId field must be there, and we
3292  // currently require that the physical id field is specified as well.
3293  if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
3294  CLEANUP_THREAD_INFO;
3295  *msg_id = kmp_i18n_str_MissingProcField;
3296  return false;
3297  }
3298  if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
3299  CLEANUP_THREAD_INFO;
3300  *msg_id = kmp_i18n_str_MissingPhysicalIDField;
3301  return false;
3302  }
3303 
3304  // Skip this proc if it is not included in the machine model.
3305  if (KMP_AFFINITY_CAPABLE() &&
3306  !KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex],
3307  __kmp_affin_fullMask)) {
3308  INIT_PROC_INFO(threadInfo[num_avail]);
3309  continue;
3310  }
3311 
3312  // We have a successful parse of this proc's info.
3313  // Increment the counter, and prepare for the next proc.
3314  num_avail++;
3315  KMP_ASSERT(num_avail <= num_records);
3316  INIT_PROC_INFO(threadInfo[num_avail]);
3317  }
3318  continue;
3319 
3320  no_val:
3321  CLEANUP_THREAD_INFO;
3322  *msg_id = kmp_i18n_str_MissingValCpuinfo;
3323  return false;
3324 
3325  dup_field:
3326  CLEANUP_THREAD_INFO;
3327  *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
3328  return false;
3329  }
3330  *line = 0;
3331 
3332 #if KMP_MIC && REDUCE_TEAM_SIZE
3333  unsigned teamSize = 0;
3334 #endif // KMP_MIC && REDUCE_TEAM_SIZE
3335 
3336  // check for num_records == __kmp_xproc ???
3337 
3338  // If it is configured to omit the package level when there is only a single
3339  // package, the logic at the end of this routine won't work if there is only a
3340  // single thread.
3341  KMP_ASSERT(num_avail > 0);
3342  KMP_ASSERT(num_avail <= num_records);
3343 
3344  // Sort the threadInfo table by physical Id.
3345  qsort(threadInfo, num_avail, sizeof(*threadInfo),
3346  __kmp_affinity_cmp_ProcCpuInfo_phys_id);
3347 
3348 #endif // KMP_OS_AIX
3349 
3350  // The table is now sorted by pkgId / coreId / threadId, but we really don't
3351  // know the radix of any of the fields. pkgId's may be sparsely assigned among
3352  // the chips on a system. Although coreId's are usually assigned
3353  // [0 .. coresPerPkg-1] and threadId's are usually assigned
3354  // [0..threadsPerCore-1], we don't want to make any such assumptions.
3355  //
3356  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
3357  // total # packages) are at this point - we want to determine that now. We
3358  // only have an upper bound on the first two figures.
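  // For example, on a uniform 2-socket x 2-core x 2-thread machine the pass
  // below would finish with totals[pkgIdIndex] == 2, totals[coreIdIndex] == 4,
  // maxCt[coreIdIndex] == 2 and maxCt[threadIdIndex] == 2, so nPackages = 2,
  // __kmp_ncores = 4, nCoresPerPkg = 2 and __kmp_nThreadsPerCore = 2.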
3359  unsigned *counts =
3360  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3361  unsigned *maxCt =
3362  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3363  unsigned *totals =
3364  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3365  unsigned *lastId =
3366  (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
3367 
3368  bool assign_thread_ids = false;
3369  unsigned threadIdCt;
3370  unsigned index;
3371 
3372 restart_radix_check:
3373  threadIdCt = 0;
3374 
3375  // Initialize the counter arrays with data from threadInfo[0].
3376  if (assign_thread_ids) {
3377  if (threadInfo[0][threadIdIndex] == UINT_MAX) {
3378  threadInfo[0][threadIdIndex] = threadIdCt++;
3379  } else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
3380  threadIdCt = threadInfo[0][threadIdIndex] + 1;
3381  }
3382  }
3383  for (index = 0; index <= maxIndex; index++) {
3384  counts[index] = 1;
3385  maxCt[index] = 1;
3386  totals[index] = 1;
3387  lastId[index] = threadInfo[0][index];
3389  }
3390 
3391  // Run through the rest of the OS procs.
3392  for (i = 1; i < num_avail; i++) {
3393  // Find the most significant index whose id differs from the id for the
3394  // previous OS proc.
3395  for (index = maxIndex; index >= threadIdIndex; index--) {
3396  if (assign_thread_ids && (index == threadIdIndex)) {
3397  // Auto-assign the thread id field if it wasn't specified.
3398  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
3399  threadInfo[i][threadIdIndex] = threadIdCt++;
3400  }
3401  // Apparently the thread id field was specified for some entries and not
3402  // others. Start the thread id counter off at the next higher thread id.
3403  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
3404  threadIdCt = threadInfo[i][threadIdIndex] + 1;
3405  }
3406  }
3407  if (threadInfo[i][index] != lastId[index]) {
3408  // Run through all indices which are less significant, and reset the
3409  // counts to 1. At all levels up to and including index, we need to
3410  // increment the totals and record the last id.
3411  unsigned index2;
3412  for (index2 = threadIdIndex; index2 < index; index2++) {
3413  totals[index2]++;
3414  if (counts[index2] > maxCt[index2]) {
3415  maxCt[index2] = counts[index2];
3416  }
3417  counts[index2] = 1;
3418  lastId[index2] = threadInfo[i][index2];
3419  }
3420  counts[index]++;
3421  totals[index]++;
3422  lastId[index] = threadInfo[i][index];
3423 
3424  if (assign_thread_ids && (index > threadIdIndex)) {
3425 
3426 #if KMP_MIC && REDUCE_TEAM_SIZE
3427  // The default team size is the total #threads in the machine
3428  // minus 1 thread for every core that has 3 or more threads.
3429  teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
3430 #endif // KMP_MIC && REDUCE_TEAM_SIZE
3431 
3432  // Restart the thread counter, as we are on a new core.
3433  threadIdCt = 0;
3434 
3435  // Auto-assign the thread id field if it wasn't specified.
3436  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
3437  threadInfo[i][threadIdIndex] = threadIdCt++;
3438  }
3439 
3440  // Apparently the thread id field was specified for some entries and
3441  // not others. Start the thread id counter off at the next higher
3442  // thread id.
3443  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
3444  threadIdCt = threadInfo[i][threadIdIndex] + 1;
3445  }
3446  }
3447  break;
3448  }
3449  }
3450  if (index < threadIdIndex) {
3451  // If thread ids were specified, it is an error if they are not unique.
3452  // Also, check that we haven't already restarted the loop (to be safe -
3453  // shouldn't need to).
3454  if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
3455  __kmp_free(lastId);
3456  __kmp_free(totals);
3457  __kmp_free(maxCt);
3458  __kmp_free(counts);
3459  CLEANUP_THREAD_INFO;
3460  *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
3461  return false;
3462  }
3463 
3464  // If the thread ids were not specified and we see entries that
3465  // are duplicates, start the loop over and assign the thread ids manually.
3466  assign_thread_ids = true;
3467  goto restart_radix_check;
3468  }
3469  }
3470 
3471 #if KMP_MIC && REDUCE_TEAM_SIZE
3472  // The default team size is the total #threads in the machine
3473  // minus 1 thread for every core that has 3 or more threads.
3474  teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
3475 #endif // KMP_MIC && REDUCE_TEAM_SIZE
3476 
3477  for (index = threadIdIndex; index <= maxIndex; index++) {
3478  if (counts[index] > maxCt[index]) {
3479  maxCt[index] = counts[index];
3480  }
3481  }
3482 
3483  __kmp_nThreadsPerCore = maxCt[threadIdIndex];
3484  nCoresPerPkg = maxCt[coreIdIndex];
3485  nPackages = totals[pkgIdIndex];
3486 
3487  // When affinity is off, this routine will still be called to set
3488  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
3489  // Make sure all these vars are set correctly, and return now if affinity is
3490  // not enabled.
3491  __kmp_ncores = totals[coreIdIndex];
3492  if (!KMP_AFFINITY_CAPABLE()) {
3493  KMP_ASSERT(__kmp_affinity.type == affinity_none);
3494  return true;
3495  }
3496 
3497 #if KMP_MIC && REDUCE_TEAM_SIZE
3498  // Set the default team size.
3499  if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
3500  __kmp_dflt_team_nth = teamSize;
3501  KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting "
3502  "__kmp_dflt_team_nth = %d\n",
3503  __kmp_dflt_team_nth));
3504  }
3505 #endif // KMP_MIC && REDUCE_TEAM_SIZE
3506 
3507  KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc);
3508 
3509  // Count the number of levels which have more nodes at that level than at the
3510  // parent's level (with there being an implicit root node above the top level).
3511  // This is equivalent to saying that there is at least one node at this level
3512  // which has a sibling. These levels are in the map, and the package level is
3513  // always in the map.
3514  bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
3515  for (index = threadIdIndex; index < maxIndex; index++) {
3516  KMP_ASSERT(totals[index] >= totals[index + 1]);
3517  inMap[index] = (totals[index] > totals[index + 1]);
3518  }
3519  inMap[maxIndex] = (totals[maxIndex] > 1);
3520  inMap[pkgIdIndex] = true;
3521  inMap[coreIdIndex] = true;
3522  inMap[threadIdIndex] = true;
3523 
3524  int depth = 0;
3525  int idx = 0;
3526  kmp_hw_t types[KMP_HW_LAST];
3527  int pkgLevel = -1;
3528  int coreLevel = -1;
3529  int threadLevel = -1;
3530  for (index = threadIdIndex; index <= maxIndex; index++) {
3531  if (inMap[index]) {
3532  depth++;
3533  }
3534  }
3535  if (inMap[pkgIdIndex]) {
3536  pkgLevel = idx;
3537  types[idx++] = KMP_HW_SOCKET;
3538  }
3539  if (inMap[coreIdIndex]) {
3540  coreLevel = idx;
3541  types[idx++] = KMP_HW_CORE;
3542  }
3543  if (inMap[threadIdIndex]) {
3544  threadLevel = idx;
3545  types[idx++] = KMP_HW_THREAD;
3546  }
3547  KMP_ASSERT(depth > 0);
3548 
3549  // Construct the data structure that is to be returned.
3550  __kmp_topology = kmp_topology_t::allocate(num_avail, depth, types);
3551 
3552  for (i = 0; i < num_avail; ++i) {
3553  unsigned os = threadInfo[i][osIdIndex];
3554  int src_index;
3555  kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
3556  hw_thread.clear();
3557  hw_thread.os_id = os;
3558 
3559  idx = 0;
3560  for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
3561  if (!inMap[src_index]) {
3562  continue;
3563  }
3564  if (src_index == pkgIdIndex) {
3565  hw_thread.ids[pkgLevel] = threadInfo[i][src_index];
3566  } else if (src_index == coreIdIndex) {
3567  hw_thread.ids[coreLevel] = threadInfo[i][src_index];
3568  } else if (src_index == threadIdIndex) {
3569  hw_thread.ids[threadLevel] = threadInfo[i][src_index];
3570  }
3571  }
3572  }
3573 
3574  __kmp_free(inMap);
3575  __kmp_free(lastId);
3576  __kmp_free(totals);
3577  __kmp_free(maxCt);
3578  __kmp_free(counts);
3579  CLEANUP_THREAD_INFO;
3580  __kmp_topology->sort_ids();
3581  if (!__kmp_topology->check_ids()) {
3582  kmp_topology_t::deallocate(__kmp_topology);
3583  __kmp_topology = nullptr;
3584  *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
3585  return false;
3586  }
3587  return true;
3588 }
3589 
3590 // Create and return a table of affinity masks, indexed by OS thread ID.
3591 // This routine handles OR'ing together all the affinity masks of threads
3592 // that are sufficiently close, if granularity > fine.
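// For example, with granularity set to core on a machine with two hardware
// threads per core, sibling hw threads with OS ids 0 and 1 would (in this
// sketch) both end up with the os_id_masks entry {0,1}.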
3593 template <typename FindNextFunctionType>
3594 static void __kmp_create_os_id_masks(unsigned *numUnique,
3595  kmp_affinity_t &affinity,
3596  FindNextFunctionType find_next) {
3597  // First form a table of affinity masks in order of OS thread id.
3598  int maxOsId;
3599  int i;
3600  int numAddrs = __kmp_topology->get_num_hw_threads();
3601  int depth = __kmp_topology->get_depth();
3602  const char *env_var = __kmp_get_affinity_env_var(affinity);
3603  KMP_ASSERT(numAddrs);
3604  KMP_ASSERT(depth);
3605 
3606  i = find_next(-1);
3607  // If no HW thread location with the requested attributes was found, return
3608  // and fall back to the incrementing find_next that ignores core attributes.
3609  if (i >= numAddrs)
3610  return;
3611 
3612  maxOsId = 0;
3613  for (i = numAddrs - 1;; --i) {
3614  int osId = __kmp_topology->at(i).os_id;
3615  if (osId > maxOsId) {
3616  maxOsId = osId;
3617  }
3618  if (i == 0)
3619  break;
3620  }
3621  affinity.num_os_id_masks = maxOsId + 1;
3622  KMP_CPU_ALLOC_ARRAY(affinity.os_id_masks, affinity.num_os_id_masks);
3623  KMP_ASSERT(affinity.gran_levels >= 0);
3624  if (affinity.flags.verbose && (affinity.gran_levels > 0)) {
3625  KMP_INFORM(ThreadsMigrate, env_var, affinity.gran_levels);
3626  }
3627  if (affinity.gran_levels >= (int)depth) {
3628  KMP_AFF_WARNING(affinity, AffThreadsMayMigrate);
3629  }
3630 
3631  // Run through the table, forming the masks for all threads on each core.
3632  // Threads on the same core will have identical kmp_hw_thread_t objects, not
3633  // considering the last level, which must be the thread id. All threads on a
3634  // core will appear consecutively.
3635  int unique = 0;
3636  int j = 0; // index of 1st thread on core
3637  int leader = 0;
3638  kmp_affin_mask_t *sum;
3639  KMP_CPU_ALLOC_ON_STACK(sum);
3640  KMP_CPU_ZERO(sum);
3641 
3642  i = j = leader = find_next(-1);
3643  KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
3644  kmp_full_mask_modifier_t full_mask;
3645  for (i = find_next(i); i < numAddrs; i = find_next(i)) {
3646  // If this thread is sufficiently close to the leader (within the
3647  // granularity setting), then set the bit for this os thread in the
3648  // affinity mask for this group, and go on to the next thread.
3649  if (__kmp_topology->is_close(leader, i, affinity)) {
3650  KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
3651  continue;
3652  }
3653 
3654  // For every thread in this group, copy the mask to the thread's entry in
3655  // the OS Id mask table. Mark the first address as a leader.
3656  for (; j < i; j = find_next(j)) {
3657  int osId = __kmp_topology->at(j).os_id;
3658  KMP_DEBUG_ASSERT(osId <= maxOsId);
3659  kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId);
3660  KMP_CPU_COPY(mask, sum);
3661  __kmp_topology->at(j).leader = (j == leader);
3662  }
3663  unique++;
3664 
3665  // Start a new mask.
3666  leader = i;
3667  full_mask.include(sum);
3668  KMP_CPU_ZERO(sum);
3669  KMP_CPU_SET(__kmp_topology->at(i).os_id, sum);
3670  }
3671 
3672  // For every thread in last group, copy the mask to the thread's
3673  // entry in the OS Id mask table.
3674  for (; j < i; j = find_next(j)) {
3675  int osId = __kmp_topology->at(j).os_id;
3676  KMP_DEBUG_ASSERT(osId <= maxOsId);
3677  kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.os_id_masks, osId);
3678  KMP_CPU_COPY(mask, sum);
3679  __kmp_topology->at(j).leader = (j == leader);
3680  }
3681  full_mask.include(sum);
3682  unique++;
3683  KMP_CPU_FREE_FROM_STACK(sum);
3684 
3685  // See if the OS Id mask table further restricts or changes the full mask
3686  if (full_mask.restrict_to_mask() && affinity.flags.verbose) {
3687  __kmp_topology->print(env_var);
3688  }
3689 
3690  *numUnique = unique;
3691 }
3692 
3693 // State for the affinity proclist parsers. It's easier to declare these vars
3694 // as file-static than to try to pass them through the calling sequence of
3695 // the recursive-descent OMP_PLACES parser.
3696 static kmp_affin_mask_t *newMasks;
3697 static int numNewMasks;
3698 static int nextNewMask;
3699 
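// ADD_MASK appends a copy of _mask to the growable newMasks array, doubling
// the array's capacity (allocate, copy, free) whenever it is full.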
3700 #define ADD_MASK(_mask) \
3701  { \
3702  if (nextNewMask >= numNewMasks) { \
3703  int i; \
3704  numNewMasks *= 2; \
3705  kmp_affin_mask_t *temp; \
3706  KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \
3707  for (i = 0; i < numNewMasks / 2; i++) { \
3708  kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \
3709  kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \
3710  KMP_CPU_COPY(dest, src); \
3711  } \
3712  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \
3713  newMasks = temp; \
3714  } \
3715  KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
3716  nextNewMask++; \
3717  }
3718 
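// ADD_MASK_OSID adds the mask associated with OS proc _osId, but only if the
// id is in range and the proc is actually available; otherwise it warns and
// skips it.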
3719 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \
3720  { \
3721  if (((_osId) > _maxOsId) || \
3722  (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
3723  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, _osId); \
3724  } else { \
3725  ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \
3726  } \
3727  }
3728 
3729 // Re-parse the proclist (for the explicit affinity type), and form the list
3730 // of affinity newMasks indexed by gtid.
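// For example (assuming the listed OS proc ids exist and are available), a
// proclist of "0,2-6:2,{8,9}" produces the masks {0}, {2}, {4}, {6} and
// {8,9}, in that order.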
3731 static void __kmp_affinity_process_proclist(kmp_affinity_t &affinity) {
3732  int i;
3733  kmp_affin_mask_t **out_masks = &affinity.masks;
3734  unsigned *out_numMasks = &affinity.num_masks;
3735  const char *proclist = affinity.proclist;
3736  kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
3737  int maxOsId = affinity.num_os_id_masks - 1;
3738  const char *scan = proclist;
3739  const char *next = proclist;
3740 
3741  // Allocate a growable temporary mask vector; the ADD_MASK macro doubles
3742  // its capacity (allocate, copy, free) whenever it fills up.
3743  numNewMasks = 2;
3744  KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
3745  nextNewMask = 0;
3746  kmp_affin_mask_t *sumMask;
3747  KMP_CPU_ALLOC(sumMask);
3748  int setSize = 0;
3749 
3750  for (;;) {
3751  int start, end, stride;
3752 
3753  SKIP_WS(scan);
3754  next = scan;
3755  if (*next == '\0') {
3756  break;
3757  }
3758 
3759  if (*next == '{') {
3760  int num;
3761  setSize = 0;
3762  next++; // skip '{'
3763  SKIP_WS(next);
3764  scan = next;
3765 
3766  // Read the first integer in the set.
3767  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist");
3768  SKIP_DIGITS(next);
3769  num = __kmp_str_to_int(scan, *next);
3770  KMP_ASSERT2(num >= 0, "bad explicit proc list");
3771 
3772  // Copy the mask for that osId to the sum (union) mask.
3773  if ((num > maxOsId) ||
3774  (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3775  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
3776  KMP_CPU_ZERO(sumMask);
3777  } else {
3778  KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
3779  setSize = 1;
3780  }
3781 
3782  for (;;) {
3783  // Check for end of set.
3784  SKIP_WS(next);
3785  if (*next == '}') {
3786  next++; // skip '}'
3787  break;
3788  }
3789 
3790  // Skip optional comma.
3791  if (*next == ',') {
3792  next++;
3793  }
3794  SKIP_WS(next);
3795 
3796  // Read the next integer in the set.
3797  scan = next;
3798  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3799 
3800  SKIP_DIGITS(next);
3801  num = __kmp_str_to_int(scan, *next);
3802  KMP_ASSERT2(num >= 0, "bad explicit proc list");
3803 
3804  // Add the mask for that osId to the sum mask.
3805  if ((num > maxOsId) ||
3806  (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3807  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
3808  } else {
3809  KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
3810  setSize++;
3811  }
3812  }
3813  if (setSize > 0) {
3814  ADD_MASK(sumMask);
3815  }
3816 
3817  SKIP_WS(next);
3818  if (*next == ',') {
3819  next++;
3820  }
3821  scan = next;
3822  continue;
3823  }
3824 
3825  // Read the first integer.
3826  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3827  SKIP_DIGITS(next);
3828  start = __kmp_str_to_int(scan, *next);
3829  KMP_ASSERT2(start >= 0, "bad explicit proc list");
3830  SKIP_WS(next);
3831 
3832  // If this isn't a range, then add a mask to the list and go on.
3833  if (*next != '-') {
3834  ADD_MASK_OSID(start, osId2Mask, maxOsId);
3835 
3836  // Skip optional comma.
3837  if (*next == ',') {
3838  next++;
3839  }
3840  scan = next;
3841  continue;
3842  }
3843 
3844  // This is a range. Skip over the '-' and read in the 2nd int.
3845  next++; // skip '-'
3846  SKIP_WS(next);
3847  scan = next;
3848  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3849  SKIP_DIGITS(next);
3850  end = __kmp_str_to_int(scan, *next);
3851  KMP_ASSERT2(end >= 0, "bad explicit proc list");
3852 
3853  // Check for a stride parameter
3854  stride = 1;
3855  SKIP_WS(next);
3856  if (*next == ':') {
3857  // A stride is specified. Skip over the ':' and read the 3rd int.
3858  int sign = +1;
3859  next++; // skip ':'
3860  SKIP_WS(next);
3861  scan = next;
3862  if (*next == '-') {
3863  sign = -1;
3864  next++;
3865  SKIP_WS(next);
3866  scan = next;
3867  }
3868  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
3869  SKIP_DIGITS(next);
3870  stride = __kmp_str_to_int(scan, *next);
3871  KMP_ASSERT2(stride >= 0, "bad explicit proc list");
3872  stride *= sign;
3873  }
3874 
3875  // Do some range checks.
3876  KMP_ASSERT2(stride != 0, "bad explicit proc list");
3877  if (stride > 0) {
3878  KMP_ASSERT2(start <= end, "bad explicit proc list");
3879  } else {
3880  KMP_ASSERT2(start >= end, "bad explicit proc list");
3881  }
3882  KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
3883 
3884  // Add the mask for each OS proc # to the list.
3885  if (stride > 0) {
3886  do {
3887  ADD_MASK_OSID(start, osId2Mask, maxOsId);
3888  start += stride;
3889  } while (start <= end);
3890  } else {
3891  do {
3892  ADD_MASK_OSID(start, osId2Mask, maxOsId);
3893  start += stride;
3894  } while (start >= end);
3895  }
3896 
3897  // Skip optional comma.
3898  SKIP_WS(next);
3899  if (*next == ',') {
3900  next++;
3901  }
3902  scan = next;
3903  }
3904 
3905  *out_numMasks = nextNewMask;
3906  if (nextNewMask == 0) {
3907  *out_masks = NULL;
3908  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3909  return;
3910  }
3911  KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3912  for (i = 0; i < nextNewMask; i++) {
3913  kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
3914  kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
3915  KMP_CPU_COPY(dest, src);
3916  }
3917  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3918  KMP_CPU_FREE(sumMask);
3919 }
3920 
3921 /*-----------------------------------------------------------------------------
3922 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
3923 places. Again, here is the grammar:
3924 
3925 place_list := place
3926 place_list := place , place_list
3927 place := num
3928 place := place : num
3929 place := place : num : signed
3930 place := { subplace_list }
3931 place := ! place // (lowest priority)
3932 subplace_list := subplace
3933 subplace_list := subplace , subplace_list
3934 subplace := num
3935 subplace := num : num
3936 subplace := num : num : signed
3937 signed := num
3938 signed := + signed
3939 signed := - signed
3940 -----------------------------------------------------------------------------*/
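// Some example place lists accepted by this grammar (assuming the OS proc ids
// exist on the machine):
//   OMP_PLACES="{0,1},{2,3}"   two explicit places
//   OMP_PLACES="{0:2},{2:2}"   the same two places written as start:count
//   OMP_PLACES="{0,1}:4:2"     four places {0,1},{2,3},{4,5},{6,7}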
3941 static void __kmp_process_subplace_list(const char **scan,
3942  kmp_affinity_t &affinity, int maxOsId,
3943  kmp_affin_mask_t *tempMask,
3944  int *setSize) {
3945  const char *next;
3946  kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
3947 
3948  for (;;) {
3949  int start, count, stride, i;
3950 
3951  // Read in the starting proc id
3952  SKIP_WS(*scan);
3953  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3954  next = *scan;
3955  SKIP_DIGITS(next);
3956  start = __kmp_str_to_int(*scan, *next);
3957  KMP_ASSERT(start >= 0);
3958  *scan = next;
3959 
3960  // valid follow sets are ',' ':' and '}'
3961  SKIP_WS(*scan);
3962  if (**scan == '}' || **scan == ',') {
3963  if ((start > maxOsId) ||
3964  (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3965  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
3966  } else {
3967  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3968  (*setSize)++;
3969  }
3970  if (**scan == '}') {
3971  break;
3972  }
3973  (*scan)++; // skip ','
3974  continue;
3975  }
3976  KMP_ASSERT2(**scan == ':', "bad explicit places list");
3977  (*scan)++; // skip ':'
3978 
3979  // Read count parameter
3980  SKIP_WS(*scan);
3981  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3982  next = *scan;
3983  SKIP_DIGITS(next);
3984  count = __kmp_str_to_int(*scan, *next);
3985  KMP_ASSERT(count >= 0);
3986  *scan = next;
3987 
3988  // valid follow sets are ',' ':' and '}'
3989  SKIP_WS(*scan);
3990  if (**scan == '}' || **scan == ',') {
3991  for (i = 0; i < count; i++) {
3992  if ((start > maxOsId) ||
3993  (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3994  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
3995  break; // don't proliferate warnings for large count
3996  } else {
3997  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3998  start++;
3999  (*setSize)++;
4000  }
4001  }
4002  if (**scan == '}') {
4003  break;
4004  }
4005  (*scan)++; // skip ','
4006  continue;
4007  }
4008  KMP_ASSERT2(**scan == ':', "bad explicit places list");
4009  (*scan)++; // skip ':'
4010 
4011  // Read stride parameter
4012  int sign = +1;
4013  for (;;) {
4014  SKIP_WS(*scan);
4015  if (**scan == '+') {
4016  (*scan)++; // skip '+'
4017  continue;
4018  }
4019  if (**scan == '-') {
4020  sign *= -1;
4021  (*scan)++; // skip '-'
4022  continue;
4023  }
4024  break;
4025  }
4026  SKIP_WS(*scan);
4027  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
4028  next = *scan;
4029  SKIP_DIGITS(next);
4030  stride = __kmp_str_to_int(*scan, *next);
4031  KMP_ASSERT(stride >= 0);
4032  *scan = next;
4033  stride *= sign;
4034 
4035  // valid follow sets are ',' and '}'
4036  SKIP_WS(*scan);
4037  if (**scan == '}' || **scan == ',') {
4038  for (i = 0; i < count; i++) {
4039  if ((start > maxOsId) ||
4040  (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
4041  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, start);
4042  break; // don't proliferate warnings for large count
4043  } else {
4044  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
4045  start += stride;
4046  (*setSize)++;
4047  }
4048  }
4049  if (**scan == '}') {
4050  break;
4051  }
4052  (*scan)++; // skip ','
4053  continue;
4054  }
4055 
4056  KMP_ASSERT2(0, "bad explicit places list");
4057  }
4058 }
4059 
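// Parse a single place. A leading '!' complements the place over the range
// [0, maxOsId]; e.g. "!{0,1}" (in this sketch) selects every known OS proc
// except 0 and 1.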
4060 static void __kmp_process_place(const char **scan, kmp_affinity_t &affinity,
4061  int maxOsId, kmp_affin_mask_t *tempMask,
4062  int *setSize) {
4063  const char *next;
4064  kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
4065 
4066  // valid follow sets are '{' '!' and num
4067  SKIP_WS(*scan);
4068  if (**scan == '{') {
4069  (*scan)++; // skip '{'
4070  __kmp_process_subplace_list(scan, affinity, maxOsId, tempMask, setSize);
4071  KMP_ASSERT2(**scan == '}', "bad explicit places list");
4072  (*scan)++; // skip '}'
4073  } else if (**scan == '!') {
4074  (*scan)++; // skip '!'
4075  __kmp_process_place(scan, affinity, maxOsId, tempMask, setSize);
4076  KMP_CPU_COMPLEMENT(maxOsId, tempMask);
4077  } else if ((**scan >= '0') && (**scan <= '9')) {
4078  next = *scan;
4079  SKIP_DIGITS(next);
4080  int num = __kmp_str_to_int(*scan, *next);
4081  KMP_ASSERT(num >= 0);
4082  if ((num > maxOsId) ||
4083  (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
4084  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, num);
4085  } else {
4086  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
4087  (*setSize)++;
4088  }
4089  *scan = next; // skip num
4090  } else {
4091  KMP_ASSERT2(0, "bad explicit places list");
4092  }
4093 }
4094 
4095 // static void
4096 void __kmp_affinity_process_placelist(kmp_affinity_t &affinity) {
4097  int i, j, count, stride, sign;
4098  kmp_affin_mask_t **out_masks = &affinity.masks;
4099  unsigned *out_numMasks = &affinity.num_masks;
4100  const char *placelist = affinity.proclist;
4101  kmp_affin_mask_t *osId2Mask = affinity.os_id_masks;
4102  int maxOsId = affinity.num_os_id_masks - 1;
4103  const char *scan = placelist;
4104  const char *next = placelist;
4105 
4106  numNewMasks = 2;
4107  KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
4108  nextNewMask = 0;
4109 
4110  // tempMask is modified based on the previous or initial
4111  // place to form the current place.
4112  // previousMask contains the previous place.
4113  kmp_affin_mask_t *tempMask;
4114  kmp_affin_mask_t *previousMask;
4115  KMP_CPU_ALLOC(tempMask);
4116  KMP_CPU_ZERO(tempMask);
4117  KMP_CPU_ALLOC(previousMask);
4118  KMP_CPU_ZERO(previousMask);
4119  int setSize = 0;
4120 
4121  for (;;) {
4122  __kmp_process_place(&scan, affinity, maxOsId, tempMask, &setSize);
4123 
4124  // valid follow sets are ',' ':' and EOL
4125  SKIP_WS(scan);
4126  if (*scan == '\0' || *scan == ',') {
4127  if (setSize > 0) {
4128  ADD_MASK(tempMask);
4129  }
4130  KMP_CPU_ZERO(tempMask);
4131  setSize = 0;
4132  if (*scan == '\0') {
4133  break;
4134  }
4135  scan++; // skip ','
4136  continue;
4137  }
4138 
4139  KMP_ASSERT2(*scan == ':', "bad explicit places list");
4140  scan++; // skip ':'
4141 
4142  // Read count parameter
4143  SKIP_WS(scan);
4144  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
4145  next = scan;
4146  SKIP_DIGITS(next);
4147  count = __kmp_str_to_int(scan, *next);
4148  KMP_ASSERT(count >= 0);
4149  scan = next;
4150 
4151  // valid follow sets are ',' ':' and EOL
4152  SKIP_WS(scan);
4153  if (*scan == '\0' || *scan == ',') {
4154  stride = +1;
4155  } else {
4156  KMP_ASSERT2(*scan == ':', "bad explicit places list");
4157  scan++; // skip ':'
4158 
4159  // Read stride parameter
4160  sign = +1;
4161  for (;;) {
4162  SKIP_WS(scan);
4163  if (*scan == '+') {
4164  scan++; // skip '+'
4165  continue;
4166  }
4167  if (*scan == '-') {
4168  sign *= -1;
4169  scan++; // skip '-'
4170  continue;
4171  }
4172  break;
4173  }
4174  SKIP_WS(scan);
4175  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
4176  next = scan;
4177  SKIP_DIGITS(next);
4178  stride = __kmp_str_to_int(scan, *next);
4179  KMP_DEBUG_ASSERT(stride >= 0);
4180  scan = next;
4181  stride *= sign;
4182  }
4183 
4184  // Add places determined by initial_place : count : stride
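  // e.g. (sketch) with OMP_PLACES="{0,1}:4:2" this loop adds the four places
  // {0,1}, {2,3}, {4,5} and {6,7}, building each new place by shifting the
  // previous one by the stride.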
4185  for (i = 0; i < count; i++) {
4186  if (setSize == 0) {
4187  break;
4188  }
4189  // Add the current place, then build the next place (tempMask) from that
4190  KMP_CPU_COPY(previousMask, tempMask);
4191  ADD_MASK(previousMask);
4192  KMP_CPU_ZERO(tempMask);
4193  setSize = 0;
4194  KMP_CPU_SET_ITERATE(j, previousMask) {
4195  if (!KMP_CPU_ISSET(j, previousMask)) {
4196  continue;
4197  }
4198  if ((j + stride > maxOsId) || (j + stride < 0) ||
4199  (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
4200  (!KMP_CPU_ISSET(j + stride,
4201  KMP_CPU_INDEX(osId2Mask, j + stride)))) {
4202  if (i < count - 1) {
4203  KMP_AFF_WARNING(affinity, AffIgnoreInvalidProcID, j + stride);
4204  }
4205  continue;
4206  }
4207  KMP_CPU_SET(j + stride, tempMask);
4208  setSize++;
4209  }
4210  }
4211  KMP_CPU_ZERO(tempMask);
4212  setSize = 0;
4213 
4214  // valid follow sets are ',' and EOL
4215  SKIP_WS(scan);
4216  if (*scan == '\0') {
4217  break;
4218  }
4219  if (*scan == ',') {
4220  scan++; // skip ','
4221  continue;
4222  }
4223 
4224  KMP_ASSERT2(0, "bad explicit places list");
4225  }
4226 
4227  *out_numMasks = nextNewMask;
4228  if (nextNewMask == 0) {
4229  *out_masks = NULL;
4230  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
4231  return;
4232  }
4233  KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
4234  KMP_CPU_FREE(tempMask);
4235  KMP_CPU_FREE(previousMask);
4236  for (i = 0; i < nextNewMask; i++) {
4237  kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
4238  kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
4239  KMP_CPU_COPY(dest, src);
4240  }
4241  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
4242 }
4243 
4244 #undef ADD_MASK
4245 #undef ADD_MASK_OSID
4246 
4247 // This function figures out the deepest level at which there is at least one
4248 // cluster/core with more than one processing unit bound to it.
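// For example, on a socket/core/thread topology with bottom_level == 2 and
// more than one thread per core, the loop below settles on core_level == 1,
// i.e. the core level.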
4249 static int __kmp_affinity_find_core_level(int nprocs, int bottom_level) {
4250  int core_level = 0;
4251 
4252  for (int i = 0; i < nprocs; i++) {
4253  const kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
4254  for (int j = bottom_level; j > 0; j--) {
4255  if (hw_thread.ids[j] > 0) {
4256  if (core_level < (j - 1)) {
4257  core_level = j - 1;
4258  }
4259  }
4260  }
4261  }
4262  return core_level;
4263 }
4264 
4265 // This function counts the number of clusters/cores at the given level.
4266 static int __kmp_affinity_compute_ncores(int nprocs, int bottom_level,
4267  int core_level) {
4268  return __kmp_topology->get_count(core_level);
4269 }
4270 // This function finds to which cluster/core the given processing unit is bound.
4271 static int __kmp_affinity_find_core(int proc, int bottom_level,
4272  int core_level) {
4273  int core = 0;
4274  KMP_DEBUG_ASSERT(proc >= 0 && proc < __kmp_topology->get_num_hw_threads());
4275  for (int i = 0; i <= proc; ++i) {
4276  if (i + 1 <= proc) {
4277  for (int j = 0; j <= core_level; ++j) {
4278  if (__kmp_topology->at(i + 1).sub_ids[j] !=
4279  __kmp_topology->at(i).sub_ids[j]) {
4280  core++;
4281  break;
4282  }
4283  }
4284  }
4285  }
4286  return core;
4287 }
4288 
4289 // This function finds the maximal number of processing units bound to a
4290 // cluster/core at the given level.
4291 static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level,
4292  int core_level) {
4293  if (core_level >= bottom_level)
4294  return 1;
4295  int thread_level = __kmp_topology->get_level(KMP_HW_THREAD);
4296  return __kmp_topology->calculate_ratio(thread_level, core_level);
4297 }
4298 
4299 static int *procarr = NULL;
4300 static int __kmp_aff_depth = 0;
4301 static int *__kmp_osid_to_hwthread_map = NULL;
4302 
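// Given an affinity mask, fill in ids/attrs with the topology ids and core
// attributes covered by the mask. If the mask spans more than one unit at
// some level, that level and everything below it are marked MULTIPLE_ID, and
// mixed core attributes are reported as unknown.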
4303 static void __kmp_affinity_get_mask_topology_info(const kmp_affin_mask_t *mask,
4304  kmp_affinity_ids_t &ids,
4305  kmp_affinity_attrs_t &attrs) {
4306  if (!KMP_AFFINITY_CAPABLE())
4307  return;
4308 
4309  // Initialize the ids and attrs thread data
4310  for (int i = 0; i < KMP_HW_LAST; ++i)
4311  ids.ids[i] = kmp_hw_thread_t::UNKNOWN_ID;
4312  attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
4313 
4314  // Iterate through each os id within the mask and determine
4315  // the topology id and attribute information
4316  int cpu;
4317  int depth = __kmp_topology->get_depth();
4318  KMP_CPU_SET_ITERATE(cpu, mask) {
4319  int osid_idx = __kmp_osid_to_hwthread_map[cpu];
4320  ids.os_id = cpu;
4321  const kmp_hw_thread_t &hw_thread = __kmp_topology->at(osid_idx);
4322  for (int level = 0; level < depth; ++level) {
4323  kmp_hw_t type = __kmp_topology->get_type(level);
4324  int id = hw_thread.sub_ids[level];
4325  if (ids.ids[type] == kmp_hw_thread_t::UNKNOWN_ID || ids.ids[type] == id) {
4326  ids.ids[type] = id;
4327  } else {
4328  // This mask spans across multiple topology units, set it as such
4329  // and mark every level below as such as well.
4330  ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
4331  for (; level < depth; ++level) {
4332  kmp_hw_t type = __kmp_topology->get_type(level);
4333  ids.ids[type] = kmp_hw_thread_t::MULTIPLE_ID;
4334  }
4335  }
4336  }
4337  if (!attrs.valid) {
4338  attrs.core_type = hw_thread.attrs.get_core_type();
4339  attrs.core_eff = hw_thread.attrs.get_core_eff();
4340  attrs.valid = 1;
4341  } else {
4342  // This mask spans across multiple attributes, set it as such
4343  if (attrs.core_type != hw_thread.attrs.get_core_type())
4344  attrs.core_type = KMP_HW_CORE_TYPE_UNKNOWN;
4345  if (attrs.core_eff != hw_thread.attrs.get_core_eff())
4346  attrs.core_eff = kmp_hw_attr_t::UNKNOWN_CORE_EFF;
4347  }
4348  }
4349 }
4350 
4351 static void __kmp_affinity_get_thread_topology_info(kmp_info_t *th) {
4352  if (!KMP_AFFINITY_CAPABLE())
4353  return;
4354  const kmp_affin_mask_t *mask = th->th.th_affin_mask;
4355  kmp_affinity_ids_t &ids = th->th.th_topology_ids;
4356  kmp_affinity_attrs_t &attrs = th->th.th_topology_attrs;
4357  __kmp_affinity_get_mask_topology_info(mask, ids, attrs);
4358 }
4359 
4360 // Assign the topology information to each place in the place list.
4361 // A thread can then grab not only its affinity mask, but also the topology
4362 // information associated with that mask, e.g., which socket the thread is on.
4363 static void __kmp_affinity_get_topology_info(kmp_affinity_t &affinity) {
4364  if (!KMP_AFFINITY_CAPABLE())
4365  return;
4366  if (affinity.type != affinity_none) {
4367  KMP_ASSERT(affinity.num_os_id_masks);
4368  KMP_ASSERT(affinity.os_id_masks);
4369  }
4370  KMP_ASSERT(affinity.num_masks);
4371  KMP_ASSERT(affinity.masks);
4372  KMP_ASSERT(__kmp_affin_fullMask);
4373 
4374  int max_cpu = __kmp_affin_fullMask->get_max_cpu();
4375  int num_hw_threads = __kmp_topology->get_num_hw_threads();
4376 
4377  // Allocate thread topology information
4378  if (!affinity.ids) {
4379  affinity.ids = (kmp_affinity_ids_t *)__kmp_allocate(
4380  sizeof(kmp_affinity_ids_t) * affinity.num_masks);
4381  }
4382  if (!affinity.attrs) {
4383  affinity.attrs = (kmp_affinity_attrs_t *)__kmp_allocate(
4384  sizeof(kmp_affinity_attrs_t) * affinity.num_masks);
4385  }
4386  if (!__kmp_osid_to_hwthread_map) {
4387  // Want the +1 because max_cpu should be a valid index into the map
4388  __kmp_osid_to_hwthread_map =
4389  (int *)__kmp_allocate(sizeof(int) * (max_cpu + 1));
4390  }
4391 
4392  // Create the OS proc to hardware thread map
4393  for (int hw_thread = 0; hw_thread < num_hw_threads; ++hw_thread) {
4394  int os_id = __kmp_topology->at(hw_thread).os_id;
4395  if (KMP_CPU_ISSET(os_id, __kmp_affin_fullMask))
4396  __kmp_osid_to_hwthread_map[os_id] = hw_thread;
4397  }
4398 
4399  for (unsigned i = 0; i < affinity.num_masks; ++i) {
4400  kmp_affinity_ids_t &ids = affinity.ids[i];
4401  kmp_affinity_attrs_t &attrs = affinity.attrs[i];
4402  kmp_affin_mask_t *mask = KMP_CPU_INDEX(affinity.masks, i);
4403  __kmp_affinity_get_mask_topology_info(mask, ids, attrs);
4404  }
4405 }
4406 
4407 // Called when __kmp_topology is ready
4408 static void __kmp_aux_affinity_initialize_other_data(kmp_affinity_t &affinity) {
4409  // Initialize other data structures which depend on the topology
4410  if (__kmp_topology && __kmp_topology->get_num_hw_threads()) {
4411  machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
4412  __kmp_affinity_get_topology_info(affinity);
4413 #if KMP_WEIGHTED_ITERATIONS_SUPPORTED
4414  __kmp_first_osid_with_ecore = __kmp_get_first_osid_with_ecore();
4415 #endif
4416  }
4417 }
4418 
4419 // Create a one element mask array (set of places) which only contains the
4420 // initial process's affinity mask
4421 static void __kmp_create_affinity_none_places(kmp_affinity_t &affinity) {
4422  KMP_ASSERT(__kmp_affin_fullMask != NULL);
4423  KMP_ASSERT(affinity.type == affinity_none);
4424  KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
4425  affinity.num_masks = 1;
4426  KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks);
4427  kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, 0);
4428  KMP_CPU_COPY(dest, __kmp_affin_fullMask);
4429  __kmp_aux_affinity_initialize_other_data(affinity);
4430 }
4431 
4432 static void __kmp_aux_affinity_initialize_masks(kmp_affinity_t &affinity) {
4433  // Create the "full" mask - this defines all of the processors that we
4434  // consider to be in the machine model. If respect is set, then it is the
4435  // initialization thread's affinity mask. Otherwise, it is all processors that
4436  // we know about on the machine.
4437  int verbose = affinity.flags.verbose;
4438  const char *env_var = affinity.env_var;
4439 
4440  // Already initialized
4441  if (__kmp_affin_fullMask && __kmp_affin_origMask)
4442  return;
4443 
4444  if (__kmp_affin_fullMask == NULL) {
4445  KMP_CPU_ALLOC(__kmp_affin_fullMask);
4446  }
4447  if (__kmp_affin_origMask == NULL) {
4448  KMP_CPU_ALLOC(__kmp_affin_origMask);
4449  }
4450  if (KMP_AFFINITY_CAPABLE()) {
4451  __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
4452  // Make a copy before possible expanding to the entire machine mask
4453  __kmp_affin_origMask->copy(__kmp_affin_fullMask);
4454  if (affinity.flags.respect) {
4455  // Count the number of available processors.
4456  unsigned i;
4457  __kmp_avail_proc = 0;
4458  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
4459  if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
4460  continue;
4461  }
4462  __kmp_avail_proc++;
4463  }
4464  if (__kmp_avail_proc > __kmp_xproc) {
4465  KMP_AFF_WARNING(affinity, ErrorInitializeAffinity);
4466  affinity.type = affinity_none;
4467  KMP_AFFINITY_DISABLE();
4468  return;
4469  }
4470 
4471  if (verbose) {
4472  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4473  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4474  __kmp_affin_fullMask);
4475  KMP_INFORM(InitOSProcSetRespect, env_var, buf);
4476  }
4477  } else {
4478  if (verbose) {
4479  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4480  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4481  __kmp_affin_fullMask);
4482  KMP_INFORM(InitOSProcSetNotRespect, env_var, buf);
4483  }
4484  __kmp_avail_proc =
4485  __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
4486 #if KMP_OS_WINDOWS
4487  if (__kmp_num_proc_groups <= 1) {
4488  // Copy expanded full mask if topology has single processor group
4489  __kmp_affin_origMask->copy(__kmp_affin_fullMask);
4490  }
4491  // Set the process affinity mask since threads' affinity
4492  // masks must be subset of process mask in Windows* OS
4493  __kmp_affin_fullMask->set_process_affinity(true);
4494 #endif
4495  }
4496  }
4497 }
4498 
4499 static bool __kmp_aux_affinity_initialize_topology(kmp_affinity_t &affinity) {
4500  bool success = false;
4501  const char *env_var = affinity.env_var;
4502  kmp_i18n_id_t msg_id = kmp_i18n_null;
4503  int verbose = affinity.flags.verbose;
4504 
4505  // For backward compatibility, setting KMP_CPUINFO_FILE =>
4506  // KMP_TOPOLOGY_METHOD=cpuinfo
4507  if ((__kmp_cpuinfo_file != NULL) &&
4508  (__kmp_affinity_top_method == affinity_top_method_all)) {
4509  __kmp_affinity_top_method = affinity_top_method_cpuinfo;
4510  }
4511 
4512  if (__kmp_affinity_top_method == affinity_top_method_all) {
4513 // In the default code path, errors are not fatal - we just try using
4514 // another method. We only emit a warning message if affinity is on or the
4515 // verbose flag is set, and the nowarnings flag was not set.
4516 #if KMP_USE_HWLOC
4517  if (!success &&
4518  __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
4519  if (!__kmp_hwloc_error) {
4520  success = __kmp_affinity_create_hwloc_map(&msg_id);
4521  if (!success && verbose) {
4522  KMP_INFORM(AffIgnoringHwloc, env_var);
4523  }
4524  } else if (verbose) {
4525  KMP_INFORM(AffIgnoringHwloc, env_var);
4526  }
4527  }
4528 #endif
4529 
4530 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4531  if (!success) {
4532  success = __kmp_affinity_create_x2apicid_map(&msg_id);
4533  if (!success && verbose && msg_id != kmp_i18n_null) {
4534  KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4535  }
4536  }
4537  if (!success) {
4538  success = __kmp_affinity_create_apicid_map(&msg_id);
4539  if (!success && verbose && msg_id != kmp_i18n_null) {
4540  KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4541  }
4542  }
4543 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4544 
4545 #if KMP_OS_LINUX || KMP_OS_AIX
4546  if (!success) {
4547  int line = 0;
4548  success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
4549  if (!success && verbose && msg_id != kmp_i18n_null) {
4550  KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4551  }
4552  }
4553 #endif /* KMP_OS_LINUX */
4554 
4555 #if KMP_GROUP_AFFINITY
4556  if (!success && (__kmp_num_proc_groups > 1)) {
4557  success = __kmp_affinity_create_proc_group_map(&msg_id);
4558  if (!success && verbose && msg_id != kmp_i18n_null) {
4559  KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4560  }
4561  }
4562 #endif /* KMP_GROUP_AFFINITY */
4563 
4564  if (!success) {
4565  success = __kmp_affinity_create_flat_map(&msg_id);
4566  if (!success && verbose && msg_id != kmp_i18n_null) {
4567  KMP_INFORM(AffInfoStr, env_var, __kmp_i18n_catgets(msg_id));
4568  }
4569  KMP_ASSERT(success);
4570  }
4571  }
4572 
4573 // If the user has specified that a particular topology discovery method is to be
4574 // used, then we abort if that method fails. The exception is group affinity,
4575 // which might have been implicitly set.
4576 #if KMP_USE_HWLOC
4577  else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
4578  KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
4579  success = __kmp_affinity_create_hwloc_map(&msg_id);
4580  if (!success) {
4581  KMP_ASSERT(msg_id != kmp_i18n_null);
4582  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4583  }
4584  }
4585 #endif // KMP_USE_HWLOC
4586 
4587 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4588  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid ||
4589  __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
4590  success = __kmp_affinity_create_x2apicid_map(&msg_id);
4591  if (!success) {
4592  KMP_ASSERT(msg_id != kmp_i18n_null);
4593  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4594  }
4595  } else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
4596  success = __kmp_affinity_create_apicid_map(&msg_id);
4597  if (!success) {
4598  KMP_ASSERT(msg_id != kmp_i18n_null);
4599  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4600  }
4601  }
4602 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4603 
4604  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
4605  int line = 0;
4606  success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
4607  if (!success) {
4608  KMP_ASSERT(msg_id != kmp_i18n_null);
4609  const char *filename = __kmp_cpuinfo_get_filename();
4610  if (line > 0) {
4611  KMP_FATAL(FileLineMsgExiting, filename, line,
4612  __kmp_i18n_catgets(msg_id));
4613  } else {
4614  KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
4615  }
4616  }
4617  }
4618 
4619 #if KMP_GROUP_AFFINITY
4620  else if (__kmp_affinity_top_method == affinity_top_method_group) {
4621  success = __kmp_affinity_create_proc_group_map(&msg_id);
4622  KMP_ASSERT(success);
4623  if (!success) {
4624  KMP_ASSERT(msg_id != kmp_i18n_null);
4625  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4626  }
4627  }
4628 #endif /* KMP_GROUP_AFFINITY */
4629 
4630  else if (__kmp_affinity_top_method == affinity_top_method_flat) {
4631  success = __kmp_affinity_create_flat_map(&msg_id);
4632  // should not fail
4633  KMP_ASSERT(success);
4634  }
4635 
4636  // Early exit if topology could not be created
4637  if (!__kmp_topology) {
4638  if (KMP_AFFINITY_CAPABLE()) {
4639  KMP_AFF_WARNING(affinity, ErrorInitializeAffinity);
4640  }
4641  if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 &&
4642  __kmp_ncores > 0) {
4643  __kmp_topology = kmp_topology_t::allocate(0, 0, NULL);
4644  __kmp_topology->canonicalize(nPackages, nCoresPerPkg,
4645  __kmp_nThreadsPerCore, __kmp_ncores);
4646  if (verbose) {
4647  __kmp_topology->print(env_var);
4648  }
4649  }
4650  return false;
4651  }
4652 
4653  // Canonicalize, print (if requested), apply KMP_HW_SUBSET
4654  __kmp_topology->canonicalize();
4655  if (verbose)
4656  __kmp_topology->print(env_var);
4657  bool filtered = __kmp_topology->filter_hw_subset();
4658  if (filtered && verbose)
4659  __kmp_topology->print("KMP_HW_SUBSET");
4660  return success;
4661 }
4662 
4663 static void __kmp_aux_affinity_initialize(kmp_affinity_t &affinity) {
4664  bool is_regular_affinity = (&affinity == &__kmp_affinity);
4665  bool is_hidden_helper_affinity = (&affinity == &__kmp_hh_affinity);
4666  const char *env_var = __kmp_get_affinity_env_var(affinity);
4667 
4668  if (affinity.flags.initialized) {
4669  KMP_ASSERT(__kmp_affin_fullMask != NULL);
4670  return;
4671  }
4672 
4673  if (is_regular_affinity && (!__kmp_affin_fullMask || !__kmp_affin_origMask))
4674  __kmp_aux_affinity_initialize_masks(affinity);
4675 
4676  if (is_regular_affinity && !__kmp_topology) {
4677  bool success = __kmp_aux_affinity_initialize_topology(affinity);
4678  if (success) {
4679  KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
4680  } else {
4681  affinity.type = affinity_none;
4682  KMP_AFFINITY_DISABLE();
4683  }
4684  }
4685 
4686  // If KMP_AFFINITY=none, then only create the single "none" place,
4687  // which is either the process's initial affinity mask or all of the
4688  // machine's hardware threads, depending on respect/norespect.
4689  if (affinity.type == affinity_none) {
4690  __kmp_create_affinity_none_places(affinity);
4691 #if KMP_USE_HIER_SCHED
4692  __kmp_dispatch_set_hierarchy_values();
4693 #endif
4694  affinity.flags.initialized = TRUE;
4695  return;
4696  }
4697 
4698  __kmp_topology->set_granularity(affinity);
4699  int depth = __kmp_topology->get_depth();
4700 
4701  // Create the table of masks, indexed by thread Id.
4702  unsigned numUnique;
4703  int numAddrs = __kmp_topology->get_num_hw_threads();
4704  // If OMP_PLACES=cores:<attribute> specified, then attempt
4705  // to make OS Id mask table using those attributes
4706  if (affinity.core_attr_gran.valid) {
4707  __kmp_create_os_id_masks(&numUnique, affinity, [&](int idx) {
4708  KMP_ASSERT(idx >= -1);
4709  for (int i = idx + 1; i < numAddrs; ++i)
4710  if (__kmp_topology->at(i).attrs.contains(affinity.core_attr_gran))
4711  return i;
4712  return numAddrs;
4713  });
4714  if (!affinity.os_id_masks) {
4715  const char *core_attribute;
4716  if (affinity.core_attr_gran.core_eff != kmp_hw_attr_t::UNKNOWN_CORE_EFF)
4717  core_attribute = "core_efficiency";
4718  else
4719  core_attribute = "core_type";
4720  KMP_AFF_WARNING(affinity, AffIgnoringNotAvailable, env_var,
4721  core_attribute,
4722  __kmp_hw_get_catalog_string(KMP_HW_CORE, /*plural=*/true))
4723  }
4724  }
4725  // If core attributes did not work, or none were specified,
4726  // then make OS Id mask table using typical incremental way.
4727  if (!affinity.os_id_masks) {
4728  __kmp_create_os_id_masks(&numUnique, affinity, [](int idx) {
4729  KMP_ASSERT(idx >= -1);
4730  return idx + 1;
4731  });
4732  }
4733  if (affinity.gran_levels == 0) {
4734  KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
4735  }
4736 
4737  switch (affinity.type) {
4738 
4739  case affinity_explicit:
4740  KMP_DEBUG_ASSERT(affinity.proclist != NULL);
4741  if (is_hidden_helper_affinity ||
4742  __kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) {
4743  __kmp_affinity_process_proclist(affinity);
4744  } else {
4745  __kmp_affinity_process_placelist(affinity);
4746  }
4747  if (affinity.num_masks == 0) {
4748  KMP_AFF_WARNING(affinity, AffNoValidProcID);
4749  affinity.type = affinity_none;
4750  __kmp_create_affinity_none_places(affinity);
4751  affinity.flags.initialized = TRUE;
4752  return;
4753  }
4754  break;
4755 
4756  // The other affinity types rely on sorting the hardware threads according to
4757  // some permutation of the machine topology tree. Set affinity.compact
4758  // and affinity.offset appropriately, then jump to a common code
4759  // fragment to do the sort and create the array of affinity masks.
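  // (Illustrative sketch.) For example, KMP_AFFINITY=compact keeps the
  // topological sort order, so consecutive places share cores and sockets
  // where possible, while KMP_AFFINITY=scatter inverts the significant
  // levels (depth - 1 - compact) so consecutive places are spread across
  // the machine first; the actual permutation is done by sort_compact below.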
4760  case affinity_logical:
4761  affinity.compact = 0;
4762  if (affinity.offset) {
4763  affinity.offset =
4764  __kmp_nThreadsPerCore * affinity.offset % __kmp_avail_proc;
4765  }
4766  goto sortTopology;
4767 
4768  case affinity_physical:
4769  if (__kmp_nThreadsPerCore > 1) {
4770  affinity.compact = 1;
4771  if (affinity.compact >= depth) {
4772  affinity.compact = 0;
4773  }
4774  } else {
4775  affinity.compact = 0;
4776  }
4777  if (affinity.offset) {
4778  affinity.offset =
4779  __kmp_nThreadsPerCore * affinity.offset % __kmp_avail_proc;
4780  }
4781  goto sortTopology;
4782 
4783  case affinity_scatter:
4784  if (affinity.compact >= depth) {
4785  affinity.compact = 0;
4786  } else {
4787  affinity.compact = depth - 1 - affinity.compact;
4788  }
4789  goto sortTopology;
4790 
4791  case affinity_compact:
4792  if (affinity.compact >= depth) {
4793  affinity.compact = depth - 1;
4794  }
4795  goto sortTopology;
4796 
4797  case affinity_balanced:
4798  if (depth <= 1 || is_hidden_helper_affinity) {
4799  KMP_AFF_WARNING(affinity, AffBalancedNotAvail, env_var);
4800  affinity.type = affinity_none;
4801  __kmp_create_affinity_none_places(affinity);
4802  affinity.flags.initialized = TRUE;
4803  return;
4804  } else if (!__kmp_topology->is_uniform()) {
4805  // Save the depth for further usage
4806  __kmp_aff_depth = depth;
4807 
4808  int core_level =
4809  __kmp_affinity_find_core_level(__kmp_avail_proc, depth - 1);
4810  int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, depth - 1,
4811  core_level);
4812  int maxprocpercore = __kmp_affinity_max_proc_per_core(
4813  __kmp_avail_proc, depth - 1, core_level);
4814 
4815  int nproc = ncores * maxprocpercore;
4816  if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
4817  KMP_AFF_WARNING(affinity, AffBalancedNotAvail, env_var);
4818  affinity.type = affinity_none;
4819  __kmp_create_affinity_none_places(affinity);
4820  affinity.flags.initialized = TRUE;
4821  return;
4822  }
4823 
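  // procarr is laid out as ncores rows of maxprocpercore slots; the loop
  // below stores each OS proc id into its core's row, and slots left at -1
  // mark cores that have fewer bound procs than maxprocpercore.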
4824  procarr = (int *)__kmp_allocate(sizeof(int) * nproc);
4825  for (int i = 0; i < nproc; i++) {
4826  procarr[i] = -1;
4827  }
4828 
4829  int lastcore = -1;
4830  int inlastcore = 0;
4831  for (int i = 0; i < __kmp_avail_proc; i++) {
4832  int proc = __kmp_topology->at(i).os_id;
4833  int core = __kmp_affinity_find_core(i, depth - 1, core_level);
4834 
4835  if (core == lastcore) {
4836  inlastcore++;
4837  } else {
4838  inlastcore = 0;
4839  }
4840  lastcore = core;
4841 
4842  procarr[core * maxprocpercore + inlastcore] = proc;
4843  }
4844  }
4845  if (affinity.compact >= depth) {
4846  affinity.compact = depth - 1;
4847  }
4848 
4849  sortTopology:
4850  // Allocate the gtid->affinity mask table.
4851  if (affinity.flags.dups) {
4852  affinity.num_masks = __kmp_avail_proc;
4853  } else {
4854  affinity.num_masks = numUnique;
4855  }
4856 
4857  if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
4858  (__kmp_affinity_num_places > 0) &&
4859  ((unsigned)__kmp_affinity_num_places < affinity.num_masks) &&
4860  !is_hidden_helper_affinity) {
4861  affinity.num_masks = __kmp_affinity_num_places;
4862  }
4863 
4864  KMP_CPU_ALLOC_ARRAY(affinity.masks, affinity.num_masks);
4865 
4866  // Sort the topology table according to the current setting of
4867  // affinity.compact, then fill out affinity.masks.
4868  __kmp_topology->sort_compact(affinity);
4869  {
4870  int i;
4871  unsigned j;
4872  int num_hw_threads = __kmp_topology->get_num_hw_threads();
4873  kmp_full_mask_modifier_t full_mask;
4874  for (i = 0, j = 0; i < num_hw_threads; i++) {
4875  if ((!affinity.flags.dups) && (!__kmp_topology->at(i).leader)) {
4876  continue;
4877  }
4878  int osId = __kmp_topology->at(i).os_id;
4879 
4880  kmp_affin_mask_t *src = KMP_CPU_INDEX(affinity.os_id_masks, osId);
4881  kmp_affin_mask_t *dest = KMP_CPU_INDEX(affinity.masks, j);
4882  KMP_ASSERT(KMP_CPU_ISSET(osId, src));
4883  KMP_CPU_COPY(dest, src);
4884  full_mask.include(src);
4885  if (++j >= affinity.num_masks) {
4886  break;
4887  }
4888  }
4889  KMP_DEBUG_ASSERT(j == affinity.num_masks);
4890  // See if the places list further restricts or changes the full mask
4891  if (full_mask.restrict_to_mask() && affinity.flags.verbose) {
4892  __kmp_topology->print(env_var);
4893  }
4894  }
4895  // Sort the topology back using ids
4896  __kmp_topology->sort_ids();
4897  break;
4898 
4899  default:
4900  KMP_ASSERT2(0, "Unexpected affinity setting");
4901  }
4902  __kmp_aux_affinity_initialize_other_data(affinity);
4903  affinity.flags.initialized = TRUE;
4904 }
4905 
4906 void __kmp_affinity_initialize(kmp_affinity_t &affinity) {
4907  // Much of the code above was written assuming that if a machine was not
4908  // affinity capable, then affinity type == affinity_none.
4909  // We now explicitly represent this as affinity type == affinity_disabled.
4910  // There are too many checks for affinity type == affinity_none in this code.
4911  // Instead of trying to change them all, check whether
4912  // affinity type == affinity_disabled, and if so, temporarily set it to
4913  // affinity_none, call the real initialization routine, then restore the
4914  // affinity type to affinity_disabled.
4915  int disabled = (affinity.type == affinity_disabled);
4916  if (!KMP_AFFINITY_CAPABLE())
4917  KMP_ASSERT(disabled);
4918  if (disabled)
4919  affinity.type = affinity_none;
4920  __kmp_aux_affinity_initialize(affinity);
4921  if (disabled)
4922  affinity.type = affinity_disabled;
4923 }
4924 
4925 void __kmp_affinity_uninitialize(void) {
4926  for (kmp_affinity_t *affinity : __kmp_affinities) {
4927  if (affinity->masks != NULL)
4928  KMP_CPU_FREE_ARRAY(affinity->masks, affinity->num_masks);
4929  if (affinity->os_id_masks != NULL)
4930  KMP_CPU_FREE_ARRAY(affinity->os_id_masks, affinity->num_os_id_masks);
4931  if (affinity->proclist != NULL)
4932  __kmp_free(affinity->proclist);
4933  if (affinity->ids != NULL)
4934  __kmp_free(affinity->ids);
4935  if (affinity->attrs != NULL)
4936  __kmp_free(affinity->attrs);
4937  *affinity = KMP_AFFINITY_INIT(affinity->env_var);
4938  }
4939  if (__kmp_affin_origMask != NULL) {
4940  if (KMP_AFFINITY_CAPABLE()) {
4941 #if KMP_OS_AIX
4942  // Uninitialize by unbinding the thread.
4943  bindprocessor(BINDTHREAD, thread_self(), PROCESSOR_CLASS_ANY);
4944 #else
4945  __kmp_set_system_affinity(__kmp_affin_origMask, FALSE);
4946 #endif
4947  }
4948  KMP_CPU_FREE(__kmp_affin_origMask);
4949  __kmp_affin_origMask = NULL;
4950  }
4951  __kmp_affinity_num_places = 0;
4952  if (procarr != NULL) {
4953  __kmp_free(procarr);
4954  procarr = NULL;
4955  }
4956  if (__kmp_osid_to_hwthread_map) {
4957  __kmp_free(__kmp_osid_to_hwthread_map);
4958  __kmp_osid_to_hwthread_map = NULL;
4959  }
4960 #if KMP_USE_HWLOC
4961  if (__kmp_hwloc_topology != NULL) {
4962  hwloc_topology_destroy(__kmp_hwloc_topology);
4963  __kmp_hwloc_topology = NULL;
4964  }
4965 #endif
4966  if (__kmp_hw_subset) {
4967  kmp_hw_subset_t::deallocate(__kmp_hw_subset);
4968  __kmp_hw_subset = nullptr;
4969  }
4970  if (__kmp_topology) {
4971  kmp_topology_t::deallocate(__kmp_topology);
4972  __kmp_topology = nullptr;
4973  }
4974  KMPAffinity::destroy_api();
4975 }
4976 
4977 static void __kmp_select_mask_by_gtid(int gtid, const kmp_affinity_t *affinity,
4978  int *place, kmp_affin_mask_t **mask) {
4979  int mask_idx;
4980  bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
4981  if (is_hidden_helper)
4982  // The first gtid is the regular primary thread; the second gtid is the main
4983  // thread of the hidden helper team, which does not take part in task execution.
4984  mask_idx = gtid - 2;
4985  else
4986  mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
4987  KMP_DEBUG_ASSERT(affinity->num_masks > 0);
4988  *place = (mask_idx + affinity->offset) % affinity->num_masks;
4989  *mask = KMP_CPU_INDEX(affinity->masks, *place);
4990 }
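// Worked example of the mapping above: with affinity->num_masks == 4 and
// affinity->offset == 1 (hypothetical values), mask_idx values 0, 1, 2, 3, 4
// map to places 1, 2, 3, 0, 1 via (mask_idx + affinity->offset) % num_masks.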
4991 
4992 // This function initializes the per-thread data concerning affinity including
4993 // the mask and topology information
4994 void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
4995 
4996  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4997 
4998  // Set the thread topology information to default of unknown
4999  for (int id = 0; id < KMP_HW_LAST; ++id)
5000  th->th.th_topology_ids.ids[id] = kmp_hw_thread_t::UNKNOWN_ID;
5001  th->th.th_topology_attrs = KMP_AFFINITY_ATTRS_UNKNOWN;
5002 
5003  if (!KMP_AFFINITY_CAPABLE()) {
5004  return;
5005  }
5006 
5007  if (th->th.th_affin_mask == NULL) {
5008  KMP_CPU_ALLOC(th->th.th_affin_mask);
5009  } else {
5010  KMP_CPU_ZERO(th->th.th_affin_mask);
5011  }
5012 
5013  // Copy the thread mask to the kmp_info_t structure. If
5014  // __kmp_affinity.type == affinity_none, copy the "full" mask, i.e.
5015  // one that has all of the OS proc ids set; if
5016  // __kmp_affinity.flags.respect is set, the full mask is instead the
5017  // same as the mask of the initialization thread.
5018  kmp_affin_mask_t *mask;
5019  int i;
5020  const kmp_affinity_t *affinity;
5021  bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
5022 
5023  if (is_hidden_helper)
5024  affinity = &__kmp_hh_affinity;
5025  else
5026  affinity = &__kmp_affinity;
5027 
5028  if (KMP_AFFINITY_NON_PROC_BIND || is_hidden_helper) {
5029  if ((affinity->type == affinity_none) ||
5030  (affinity->type == affinity_balanced) ||
5031  KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) {
5032 #if KMP_GROUP_AFFINITY
5033  if (__kmp_num_proc_groups > 1) {
5034  return;
5035  }
5036 #endif
5037  KMP_ASSERT(__kmp_affin_fullMask != NULL);
5038  i = 0;
5039  mask = __kmp_affin_fullMask;
5040  } else {
5041  __kmp_select_mask_by_gtid(gtid, affinity, &i, &mask);
5042  }
5043  } else {
5044  if (!isa_root || __kmp_nested_proc_bind.bind_types[0] == proc_bind_false) {
5045 #if KMP_GROUP_AFFINITY
5046  if (__kmp_num_proc_groups > 1) {
5047  return;
5048  }
5049 #endif
5050  KMP_ASSERT(__kmp_affin_fullMask != NULL);
5051  i = KMP_PLACE_ALL;
5052  mask = __kmp_affin_fullMask;
5053  } else {
5054  __kmp_select_mask_by_gtid(gtid, affinity, &i, &mask);
5055  }
5056  }
5057 
5058  th->th.th_current_place = i;
5059  if (isa_root && !is_hidden_helper) {
5060  th->th.th_new_place = i;
5061  th->th.th_first_place = 0;
5062  th->th.th_last_place = affinity->num_masks - 1;
5063  } else if (KMP_AFFINITY_NON_PROC_BIND) {
5064  // When using a Non-OMP_PROC_BIND affinity method,
5065  // set all threads' place-partition-var to the entire place list
5066  th->th.th_first_place = 0;
5067  th->th.th_last_place = affinity->num_masks - 1;
5068  }
5069  // Copy topology information associated with the place
5070  if (i >= 0) {
5071  th->th.th_topology_ids = __kmp_affinity.ids[i];
5072  th->th.th_topology_attrs = __kmp_affinity.attrs[i];
5073  }
5074 
5075  if (i == KMP_PLACE_ALL) {
5076  KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to all places\n",
5077  gtid));
5078  } else {
5079  KA_TRACE(100, ("__kmp_affinity_set_init_mask: setting T#%d to place %d\n",
5080  gtid, i));
5081  }
5082 
5083  KMP_CPU_COPY(th->th.th_affin_mask, mask);
5084 }
5085 
5086 void __kmp_affinity_bind_init_mask(int gtid) {
5087  if (!KMP_AFFINITY_CAPABLE()) {
5088  return;
5089  }
5090  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
5091  const kmp_affinity_t *affinity;
5092  const char *env_var;
5093  bool is_hidden_helper = KMP_HIDDEN_HELPER_THREAD(gtid);
5094 
5095  if (is_hidden_helper)
5096  affinity = &__kmp_hh_affinity;
5097  else
5098  affinity = &__kmp_affinity;
5099  env_var = __kmp_get_affinity_env_var(*affinity, /*for_binding=*/true);
5100  /* to avoid duplicate printing (will be correctly printed on barrier) */
5101  if (affinity->flags.verbose && (affinity->type == affinity_none ||
5102  (th->th.th_current_place != KMP_PLACE_ALL &&
5103  affinity->type != affinity_balanced)) &&
5104  !KMP_HIDDEN_HELPER_MAIN_THREAD(gtid)) {
5105  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5106  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5107  th->th.th_affin_mask);
5108  KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
5109  gtid, buf);
5110  }
5111 
5112 #if KMP_OS_WINDOWS
5113  // On Windows* OS, the process affinity mask might have changed. If the user
5114  // didn't request affinity and this call fails, just continue silently.
5115  // See CQ171393.
5116  if (affinity->type == affinity_none) {
5117  __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
5118  } else
5119 #endif
5120 #ifndef KMP_OS_AIX
5121  // Do not set the full mask as the init mask on AIX.
5122  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
5123 #endif
5124 }
5125 
5126 void __kmp_affinity_bind_place(int gtid) {
5127  // Hidden helper threads should not be affected by OMP_PLACES/OMP_PROC_BIND
5128  if (!KMP_AFFINITY_CAPABLE() || KMP_HIDDEN_HELPER_THREAD(gtid)) {
5129  return;
5130  }
5131 
5132  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
5133 
5134  KA_TRACE(100, ("__kmp_affinity_bind_place: binding T#%d to place %d (current "
5135  "place = %d)\n",
5136  gtid, th->th.th_new_place, th->th.th_current_place));
5137 
5138  // Check that the new place is within this thread's partition.
5139  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
5140  KMP_ASSERT(th->th.th_new_place >= 0);
5141  KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity.num_masks);
5142  if (th->th.th_first_place <= th->th.th_last_place) {
5143  KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
5144  (th->th.th_new_place <= th->th.th_last_place));
5145  } else {
5146  KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
5147  (th->th.th_new_place >= th->th.th_last_place));
5148  }
5149 
5150  // Copy the thread mask to the kmp_info_t structure,
5151  // and set this thread's affinity.
5152  kmp_affin_mask_t *mask =
5153  KMP_CPU_INDEX(__kmp_affinity.masks, th->th.th_new_place);
5154  KMP_CPU_COPY(th->th.th_affin_mask, mask);
5155  th->th.th_current_place = th->th.th_new_place;
5156 
5157  if (__kmp_affinity.flags.verbose) {
5158  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5159  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5160  th->th.th_affin_mask);
5161  KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
5162  __kmp_gettid(), gtid, buf);
5163  }
5164  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
5165 }
5166 
5167 int __kmp_aux_set_affinity(void **mask) {
5168  int gtid;
5169  kmp_info_t *th;
5170  int retval;
5171 
5172  if (!KMP_AFFINITY_CAPABLE()) {
5173  return -1;
5174  }
5175 
5176  gtid = __kmp_entry_gtid();
5177  KA_TRACE(
5178  1000, (""); {
5179  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5180  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5181  (kmp_affin_mask_t *)(*mask));
5182  __kmp_debug_printf(
5183  "kmp_set_affinity: setting affinity mask for thread %d = %s\n",
5184  gtid, buf);
5185  });
5186 
5187  if (__kmp_env_consistency_check) {
5188  if ((mask == NULL) || (*mask == NULL)) {
5189  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
5190  } else {
5191  unsigned proc;
5192  int num_procs = 0;
5193 
5194  KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
5195  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5196  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
5197  }
5198  if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
5199  continue;
5200  }
5201  num_procs++;
5202  }
5203  if (num_procs == 0) {
5204  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
5205  }
5206 
5207 #if KMP_GROUP_AFFINITY
5208  if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
5209  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
5210  }
5211 #endif /* KMP_GROUP_AFFINITY */
5212  }
5213  }
5214 
5215  th = __kmp_threads[gtid];
5216  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
5217  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
5218  if (retval == 0) {
5219  KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
5220  }
5221 
5222  th->th.th_current_place = KMP_PLACE_UNDEFINED;
5223  th->th.th_new_place = KMP_PLACE_UNDEFINED;
5224  th->th.th_first_place = 0;
5225  th->th.th_last_place = __kmp_affinity.num_masks - 1;
5226 
5227  // Turn off OpenMP 4.0 affinity for the current thread at this parallel level.
5228  th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
5229 
5230  return retval;
5231 }
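// Usage sketch for the public entry points that forward to the __kmp_aux_*
// routines in this file. The kmp_* wrapper names below are the documented
// affinity API defined elsewhere in the runtime; treat this as an assumed
// example of typical caller code, not something this file provides:
//
//   kmp_affinity_mask_t mask;
//   kmp_create_affinity_mask(&mask);
//   if (kmp_set_affinity_mask_proc(0, &mask) != 0) { /* proc unavailable */ }
//   if (kmp_set_affinity(&mask) != 0) { /* could not bind */ }
//   ...
//   kmp_destroy_affinity_mask(&mask);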
5232 
5233 int __kmp_aux_get_affinity(void **mask) {
5234  int gtid;
5235  int retval;
5236 #if KMP_OS_WINDOWS || KMP_OS_AIX || KMP_DEBUG
5237  kmp_info_t *th;
5238 #endif
5239  if (!KMP_AFFINITY_CAPABLE()) {
5240  return -1;
5241  }
5242 
5243  gtid = __kmp_entry_gtid();
5244 #if KMP_OS_WINDOWS || KMP_OS_AIX || KMP_DEBUG
5245  th = __kmp_threads[gtid];
5246 #else
5247  (void)gtid; // unused variable
5248 #endif
5249  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
5250 
5251  KA_TRACE(
5252  1000, (""); {
5253  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5254  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5255  th->th.th_affin_mask);
5256  __kmp_printf(
5257  "kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid,
5258  buf);
5259  });
5260 
5261  if (__kmp_env_consistency_check) {
5262  if ((mask == NULL) || (*mask == NULL)) {
5263  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
5264  }
5265  }
5266 
5267 #if !KMP_OS_WINDOWS && !KMP_OS_AIX
5268 
5269  retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
5270  KA_TRACE(
5271  1000, (""); {
5272  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5273  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5274  (kmp_affin_mask_t *)(*mask));
5275  __kmp_printf(
5276  "kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid,
5277  buf);
5278  });
5279  return retval;
5280 
5281 #else
5282  (void)retval;
5283 
5284  KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
5285  return 0;
5286 
5287 #endif /* !KMP_OS_WINDOWS && !KMP_OS_AIX */
5288 }
5289 
5290 int __kmp_aux_get_affinity_max_proc() {
5291  if (!KMP_AFFINITY_CAPABLE()) {
5292  return 0;
5293  }
5294 #if KMP_GROUP_AFFINITY
5295  if (__kmp_num_proc_groups > 1) {
5296  return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT);
5297  }
5298 #endif
5299  return __kmp_xproc;
5300 }
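// For reference, the group-affinity branch above evaluates to
// __kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT, i.e. 64 procs per
// group on 64-bit Windows, so two processor groups report a maximum of 128.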
5301 
5302 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
5303  if (!KMP_AFFINITY_CAPABLE()) {
5304  return -1;
5305  }
5306 
5307  KA_TRACE(
5308  1000, (""); {
5309  int gtid = __kmp_entry_gtid();
5310  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5311  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5312  (kmp_affin_mask_t *)(*mask));
5313  __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
5314  "affinity mask for thread %d = %s\n",
5315  proc, gtid, buf);
5316  });
5317 
5318  if (__kmp_env_consistency_check) {
5319  if ((mask == NULL) || (*mask == NULL)) {
5320  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
5321  }
5322  }
5323 
5324  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
5325  return -1;
5326  }
5327  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5328  return -2;
5329  }
5330 
5331  KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
5332  return 0;
5333 }
5334 
5335 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
5336  if (!KMP_AFFINITY_CAPABLE()) {
5337  return -1;
5338  }
5339 
5340  KA_TRACE(
5341  1000, (""); {
5342  int gtid = __kmp_entry_gtid();
5343  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5344  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5345  (kmp_affin_mask_t *)(*mask));
5346  __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
5347  "affinity mask for thread %d = %s\n",
5348  proc, gtid, buf);
5349  });
5350 
5351  if (__kmp_env_consistency_check) {
5352  if ((mask == NULL) || (*mask == NULL)) {
5353  KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
5354  }
5355  }
5356 
5357  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
5358  return -1;
5359  }
5360  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5361  return -2;
5362  }
5363 
5364  KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
5365  return 0;
5366 }
5367 
5368 int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
5369  if (!KMP_AFFINITY_CAPABLE()) {
5370  return -1;
5371  }
5372 
5373  KA_TRACE(
5374  1000, (""); {
5375  int gtid = __kmp_entry_gtid();
5376  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5377  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5378  (kmp_affin_mask_t *)(*mask));
5379  __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
5380  "affinity mask for thread %d = %s\n",
5381  proc, gtid, buf);
5382  });
5383 
5384  if (__kmp_env_consistency_check) {
5385  if ((mask == NULL) || (*mask == NULL)) {
5386  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
5387  }
5388  }
5389 
5390  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
5391  return -1;
5392  }
5393  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5394  return 0;
5395  }
5396 
5397  return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
5398 }
5399 
5400 #if KMP_WEIGHTED_ITERATIONS_SUPPORTED
5401 // Returns first os proc id with ATOM core
5402 int __kmp_get_first_osid_with_ecore(void) {
5403  int low = 0;
5404  int high = __kmp_topology->get_num_hw_threads() - 1;
5405  int mid = 0;
5406  while (high - low > 1) {
5407  mid = (high + low) / 2;
5408  if (__kmp_topology->at(mid).attrs.get_core_type() ==
5409  KMP_HW_CORE_TYPE_CORE) {
5410  low = mid + 1;
5411  } else {
5412  high = mid;
5413  }
5414  }
5415  if (__kmp_topology->at(mid).attrs.get_core_type() == KMP_HW_CORE_TYPE_ATOM) {
5416  return mid;
5417  }
5418  return -1;
5419 }
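// Note: the binary search above assumes the hardware threads in
// __kmp_topology are ordered so that all performance (CORE) entries precede
// the efficiency (ATOM) entries; -1 is returned when no ATOM entry exists.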
5420 #endif
5421 
5422 // Dynamic affinity settings - Affinity balanced
5423 void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
5424  KMP_DEBUG_ASSERT(th);
5425  bool fine_gran = true;
5426  int tid = th->th.th_info.ds.ds_tid;
5427  const char *env_var = "KMP_AFFINITY";
5428 
5429  // Do not perform balanced affinity for the hidden helper threads
5430  if (KMP_HIDDEN_HELPER_THREAD(__kmp_gtid_from_thread(th)))
5431  return;
5432 
5433  switch (__kmp_affinity.gran) {
5434  case KMP_HW_THREAD:
5435  break;
5436  case KMP_HW_CORE:
5437  if (__kmp_nThreadsPerCore > 1) {
5438  fine_gran = false;
5439  }
5440  break;
5441  case KMP_HW_SOCKET:
5442  if (nCoresPerPkg > 1) {
5443  fine_gran = false;
5444  }
5445  break;
5446  default:
5447  fine_gran = false;
5448  }
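  // Roughly speaking, fine_gran == true pins the thread to a single hardware
  // thread below, while fine_gran == false binds it to every hardware thread
  // of the core-level unit chosen for it.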
5449 
5450  if (__kmp_topology->is_uniform()) {
5451  int coreID;
5452  int threadID;
5453  // Number of hyper-threads per core on an HT machine
5454  int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
5455  // Number of cores
5456  int ncores = __kmp_ncores;
5457  if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) {
5458  __kmp_nth_per_core = __kmp_avail_proc / nPackages;
5459  ncores = nPackages;
5460  }
5461  // How many threads will be bound to each core
5462  int chunk = nthreads / ncores;
5463  // How many cores will have an additional thread bound to them - "big cores"
5464  int big_cores = nthreads % ncores;
5465  // Number of threads on the big cores
5466  int big_nth = (chunk + 1) * big_cores;
5467  if (tid < big_nth) {
5468  coreID = tid / (chunk + 1);
5469  threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
5470  } else { // tid >= big_nth
5471  coreID = (tid - big_cores) / chunk;
5472  threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
5473  }
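  // Worked example of the split above (hypothetical numbers): nthreads == 10
  // on ncores == 4 gives chunk == 2, big_cores == 2, big_nth == 6, so tids
  // 0-5 land on cores 0-1 (three threads each) and tids 6-9 on cores 2-3
  // (two threads each).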
5474  KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
5475  "Illegal set affinity operation when not capable");
5476 
5477  kmp_affin_mask_t *mask = th->th.th_affin_mask;
5478  KMP_CPU_ZERO(mask);
5479 
5480  if (fine_gran) {
5481  int osID =
5482  __kmp_topology->at(coreID * __kmp_nth_per_core + threadID).os_id;
5483  KMP_CPU_SET(osID, mask);
5484  } else {
5485  for (int i = 0; i < __kmp_nth_per_core; i++) {
5486  int osID;
5487  osID = __kmp_topology->at(coreID * __kmp_nth_per_core + i).os_id;
5488  KMP_CPU_SET(osID, mask);
5489  }
5490  }
5491  if (__kmp_affinity.flags.verbose) {
5492  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5493  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
5494  KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
5495  tid, buf);
5496  }
5497  __kmp_affinity_get_thread_topology_info(th);
5498  __kmp_set_system_affinity(mask, TRUE);
5499  } else { // Non-uniform topology
5500 
5501  kmp_affin_mask_t *mask = th->th.th_affin_mask;
5502  KMP_CPU_ZERO(mask);
5503 
5504  int core_level =
5505  __kmp_affinity_find_core_level(__kmp_avail_proc, __kmp_aff_depth - 1);
5506  int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc,
5507  __kmp_aff_depth - 1, core_level);
5508  int nth_per_core = __kmp_affinity_max_proc_per_core(
5509  __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
5510 
5511  // For performance, handle the special case nthreads == __kmp_avail_proc
5512  // separately.
5513  if (nthreads == __kmp_avail_proc) {
5514  if (fine_gran) {
5515  int osID = __kmp_topology->at(tid).os_id;
5516  KMP_CPU_SET(osID, mask);
5517  } else {
5518  int core =
5519  __kmp_affinity_find_core(tid, __kmp_aff_depth - 1, core_level);
5520  for (int i = 0; i < __kmp_avail_proc; i++) {
5521  int osID = __kmp_topology->at(i).os_id;
5522  if (__kmp_affinity_find_core(i, __kmp_aff_depth - 1, core_level) ==
5523  core) {
5524  KMP_CPU_SET(osID, mask);
5525  }
5526  }
5527  }
5528  } else if (nthreads <= ncores) {
5529 
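  // With at most one thread per core, walk the cores that still have at
  // least one available context and give the tid-th such core to this
  // thread.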
5530  int core = 0;
5531  for (int i = 0; i < ncores; i++) {
5532  // Check if this core from procarr[] is in the mask
5533  int in_mask = 0;
5534  for (int j = 0; j < nth_per_core; j++) {
5535  if (procarr[i * nth_per_core + j] != -1) {
5536  in_mask = 1;
5537  break;
5538  }
5539  }
5540  if (in_mask) {
5541  if (tid == core) {
5542  for (int j = 0; j < nth_per_core; j++) {
5543  int osID = procarr[i * nth_per_core + j];
5544  if (osID != -1) {
5545  KMP_CPU_SET(osID, mask);
5546  // For fine granularity it is enough to set the first available
5547  // osID for this core
5548  if (fine_gran) {
5549  break;
5550  }
5551  }
5552  }
5553  break;
5554  } else {
5555  core++;
5556  }
5557  }
5558  }
5559  } else { // nthreads > ncores
5560  // Array to save the number of processors at each core
5561  int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
5562  // Array to save the number of cores with exactly x available processors
5563  int *ncores_with_x_procs =
5564  (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
5565  // Array to save the number of cores with # procs from x to nth_per_core
5566  int *ncores_with_x_to_max_procs =
5567  (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
5568 
5569  for (int i = 0; i <= nth_per_core; i++) {
5570  ncores_with_x_procs[i] = 0;
5571  ncores_with_x_to_max_procs[i] = 0;
5572  }
5573 
5574  for (int i = 0; i < ncores; i++) {
5575  int cnt = 0;
5576  for (int j = 0; j < nth_per_core; j++) {
5577  if (procarr[i * nth_per_core + j] != -1) {
5578  cnt++;
5579  }
5580  }
5581  nproc_at_core[i] = cnt;
5582  ncores_with_x_procs[cnt]++;
5583  }
5584 
5585  for (int i = 0; i <= nth_per_core; i++) {
5586  for (int j = i; j <= nth_per_core; j++) {
5587  ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
5588  }
5589  }
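  // After this double loop, ncores_with_x_to_max_procs[x] is the suffix sum
  // of ncores_with_x_procs, i.e. the number of cores that still have at least
  // x available processors.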
5590 
5591  // Max number of processors
5592  int nproc = nth_per_core * ncores;
5593  // An array to keep the number of threads assigned to each context
5594  int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
5595  for (int i = 0; i < nproc; i++) {
5596  newarr[i] = 0;
5597  }
5598 
5599  int nth = nthreads;
5600  int flag = 0;
5601  while (nth > 0) {
5602  for (int j = 1; j <= nth_per_core; j++) {
5603  int cnt = ncores_with_x_to_max_procs[j];
5604  for (int i = 0; i < ncores; i++) {
5605  // Skip cores with 0 available processors
5606  if (nproc_at_core[i] == 0) {
5607  continue;
5608  }
5609  for (int k = 0; k < nth_per_core; k++) {
5610  if (procarr[i * nth_per_core + k] != -1) {
5611  if (newarr[i * nth_per_core + k] == 0) {
5612  newarr[i * nth_per_core + k] = 1;
5613  cnt--;
5614  nth--;
5615  break;
5616  } else {
5617  if (flag != 0) {
5618  newarr[i * nth_per_core + k]++;
5619  cnt--;
5620  nth--;
5621  break;
5622  }
5623  }
5624  }
5625  }
5626  if (cnt == 0 || nth == 0) {
5627  break;
5628  }
5629  }
5630  if (nth == 0) {
5631  break;
5632  }
5633  }
5634  flag = 1;
5635  }
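  // newarr[ctx] now holds the number of threads assigned to each hardware
  // context; the prefix-sum scan below picks the context whose cumulative
  // count first exceeds this thread's tid and binds to it (or to its whole
  // core when !fine_gran).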
5636  int sum = 0;
5637  for (int i = 0; i < nproc; i++) {
5638  sum += newarr[i];
5639  if (sum > tid) {
5640  if (fine_gran) {
5641  int osID = procarr[i];
5642  KMP_CPU_SET(osID, mask);
5643  } else {
5644  int coreID = i / nth_per_core;
5645  for (int ii = 0; ii < nth_per_core; ii++) {
5646  int osID = procarr[coreID * nth_per_core + ii];
5647  if (osID != -1) {
5648  KMP_CPU_SET(osID, mask);
5649  }
5650  }
5651  }
5652  break;
5653  }
5654  }
5655  __kmp_free(newarr);
5656  }
5657 
5658  if (__kmp_affinity.flags.verbose) {
5659  char buf[KMP_AFFIN_MASK_PRINT_LEN];
5660  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
5661  KMP_INFORM(BoundToOSProcSet, env_var, (kmp_int32)getpid(), __kmp_gettid(),
5662  tid, buf);
5663  }
5664  __kmp_affinity_get_thread_topology_info(th);
5665  __kmp_set_system_affinity(mask, TRUE);
5666  }
5667 }
5668 
5669 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX
5670 // We don't need this entry point on Windows because
5671 // the GetProcessAffinityMask() API is available there.
5672 //
5673 // The intended usage is indicated by these steps:
5674 // 1) The user gets the current affinity mask
5675 // 2) Then sets the affinity by calling this function
5676 // 3) Error check the return value
5677 // 4) Use non-OpenMP parallelization
5678 // 5) Reset the affinity to what was stored in step 1)
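// A minimal sketch of that sequence on Linux, assuming glibc's pthread
// affinity API; do_non_openmp_work() is a hypothetical placeholder for the
// user's own code:
//
//   cpu_set_t saved;
//   CPU_ZERO(&saved);
//   pthread_getaffinity_np(pthread_self(), sizeof(saved), &saved);  // step 1
//   int rc = kmp_set_thread_affinity_mask_initial();                // step 2
//   if (rc != 0) { /* step 3: handle -1 or a positive errno */ }
//   do_non_openmp_work();                                           // step 4
//   pthread_setaffinity_np(pthread_self(), sizeof(saved), &saved);  // step 5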
5679 #ifdef __cplusplus
5680 extern "C"
5681 #endif
5682  int
5683  kmp_set_thread_affinity_mask_initial()
5684 // The function returns 0 on success,
5685 // -1 if we cannot bind the thread,
5686 // or >0 (errno) if an error happened during binding.
5687 {
5688  int gtid = __kmp_get_gtid();
5689  if (gtid < 0) {
5690  // Do not touch non-omp threads
5691  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5692  "non-omp thread, returning\n"));
5693  return -1;
5694  }
5695  if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
5696  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5697  "affinity not initialized, returning\n"));
5698  return -1;
5699  }
5700  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5701  "set full mask for thread %d\n",
5702  gtid));
5703  KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
5704 #if KMP_OS_AIX
5705  return bindprocessor(BINDTHREAD, thread_self(), PROCESSOR_CLASS_ANY);
5706 #else
5707  return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
5708 #endif
5709 }
5710 #endif
5711 
5712 #endif // KMP_AFFINITY_SUPPORTED