LLVM OpenMP* Runtime Library
kmp_affinity.h
1 /*
2  * kmp_affinity.h -- header for affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_AFFINITY_H
14 #define KMP_AFFINITY_H
15 
16 #include "kmp.h"
17 #include "kmp_os.h"
18 #include <limits>
19 
20 #if KMP_AFFINITY_SUPPORTED
21 #if KMP_USE_HWLOC
22 class KMPHwlocAffinity : public KMPAffinity {
23 public:
24  class Mask : public KMPAffinity::Mask {
25  hwloc_cpuset_t mask;
26 
27  public:
28  Mask() {
29  mask = hwloc_bitmap_alloc();
30  this->zero();
31  }
32  ~Mask() { hwloc_bitmap_free(mask); }
33  void set(int i) override { hwloc_bitmap_set(mask, i); }
34  bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
35  void clear(int i) override { hwloc_bitmap_clr(mask, i); }
36  void zero() override { hwloc_bitmap_zero(mask); }
37  bool empty() const override { return hwloc_bitmap_iszero(mask); }
38  void copy(const KMPAffinity::Mask *src) override {
39  const Mask *convert = static_cast<const Mask *>(src);
40  hwloc_bitmap_copy(mask, convert->mask);
41  }
42  void bitwise_and(const KMPAffinity::Mask *rhs) override {
43  const Mask *convert = static_cast<const Mask *>(rhs);
44  hwloc_bitmap_and(mask, mask, convert->mask);
45  }
46  void bitwise_or(const KMPAffinity::Mask *rhs) override {
47  const Mask *convert = static_cast<const Mask *>(rhs);
48  hwloc_bitmap_or(mask, mask, convert->mask);
49  }
50  void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
51  bool is_equal(const KMPAffinity::Mask *rhs) const override {
52  const Mask *convert = static_cast<const Mask *>(rhs);
53  return hwloc_bitmap_isequal(mask, convert->mask);
54  }
55  int begin() const override { return hwloc_bitmap_first(mask); }
56  int end() const override { return -1; }
57  int next(int previous) const override {
58  return hwloc_bitmap_next(mask, previous);
59  }
60  int get_system_affinity(bool abort_on_error) override {
61  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
62  "Illegal get affinity operation when not capable");
63  long retval =
64  hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
65  if (retval >= 0) {
66  return 0;
67  }
68  int error = errno;
69  if (abort_on_error) {
70  __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
71  KMP_ERR(error), __kmp_msg_null);
72  }
73  return error;
74  }
75  int set_system_affinity(bool abort_on_error) const override {
76  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
77  "Illegal set affinity operation when not capable");
78  long retval =
79  hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
80  if (retval >= 0) {
81  return 0;
82  }
83  int error = errno;
84  if (abort_on_error) {
85  __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
86  KMP_ERR(error), __kmp_msg_null);
87  }
88  return error;
89  }
90 #if KMP_OS_WINDOWS
91  int set_process_affinity(bool abort_on_error) const override {
92  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
93  "Illegal set process affinity operation when not capable");
94  int error = 0;
95  const hwloc_topology_support *support =
96  hwloc_topology_get_support(__kmp_hwloc_topology);
97  if (support->cpubind->set_proc_cpubind) {
98  int retval;
99  retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
100  HWLOC_CPUBIND_PROCESS);
101  if (retval >= 0)
102  return 0;
103  error = errno;
104  if (abort_on_error)
105  __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
106  KMP_ERR(error), __kmp_msg_null);
107  }
108  return error;
109  }
110 #endif
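 // A sketch of the mapping used below (an assumption based on Windows
 // processor groups holding at most 64 logical CPUs and 'unsigned long'
 // being 32 bits on Windows): group i corresponds to hwloc bitmap ulongs
 // 2*i (low 32 bits) and 2*i+1 (high 32 bits). get_proc_group() returns the
 // single group covered by this mask, or -1 if the mask is empty or spans
 // more than one group.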
111  int get_proc_group() const override {
112  int group = -1;
113 #if KMP_OS_WINDOWS
114  if (__kmp_num_proc_groups == 1) {
115  return 1;
116  }
117  for (int i = 0; i < __kmp_num_proc_groups; i++) {
118  // On Windows, the long type is always 32 bits
119  unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
120  unsigned long second_32_bits =
121  hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
122  if (first_32_bits == 0 && second_32_bits == 0) {
123  continue;
124  }
125  if (group >= 0) {
126  return -1;
127  }
128  group = i;
129  }
130 #endif /* KMP_OS_WINDOWS */
131  return group;
132  }
133  };
134  void determine_capable(const char *var) override {
135  const hwloc_topology_support *topology_support;
136  if (__kmp_hwloc_topology == NULL) {
137  if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
138  __kmp_hwloc_error = TRUE;
139  if (__kmp_affinity.flags.verbose) {
140  KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
141  }
142  }
143  if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
144  __kmp_hwloc_error = TRUE;
145  if (__kmp_affinity.flags.verbose) {
146  KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
147  }
148  }
149  }
150  topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
151  // Is the system capable of setting/getting this thread's affinity?
152  // Also, is topology discovery possible? (pu indicates ability to discover
153  // processing units). And finally, were there no errors when calling any
154  // hwloc_* API functions?
155  if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
156  topology_support->cpubind->get_thisthread_cpubind &&
157  topology_support->discovery->pu && !__kmp_hwloc_error) {
158  // enables affinity according to KMP_AFFINITY_CAPABLE() macro
159  KMP_AFFINITY_ENABLE(TRUE);
160  } else {
161  // indicate that hwloc didn't work and disable affinity
162  __kmp_hwloc_error = TRUE;
163  KMP_AFFINITY_DISABLE();
164  }
165  }
166  void bind_thread(int which) override {
167  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
168  "Illegal set affinity operation when not capable");
169  KMPAffinity::Mask *mask;
170  KMP_CPU_ALLOC_ON_STACK(mask);
171  KMP_CPU_ZERO(mask);
172  KMP_CPU_SET(which, mask);
173  __kmp_set_system_affinity(mask, TRUE);
174  KMP_CPU_FREE_FROM_STACK(mask);
175  }
176  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
177  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
178  KMPAffinity::Mask *allocate_mask_array(int num) override {
179  return new Mask[num];
180  }
181  void deallocate_mask_array(KMPAffinity::Mask *array) override {
182  Mask *hwloc_array = static_cast<Mask *>(array);
183  delete[] hwloc_array;
184  }
185  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
186  int index) override {
187  Mask *hwloc_array = static_cast<Mask *>(array);
188  return &(hwloc_array[index]);
189  }
190  api_type get_api_type() const override { return HWLOC; }
191 };
192 #endif /* KMP_USE_HWLOC */
193 
194 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX
195 #if KMP_OS_LINUX
196 /* On some of the older OSes that we build on, these constants aren't present
197  in <asm/unistd.h>, #included from <sys/syscall.h>. They must be the same on
198  all systems of the same arch where they are defined, and they cannot change;
199  they are set in stone forever. */
200 #include <sys/syscall.h>
201 #if KMP_ARCH_X86 || KMP_ARCH_ARM
202 #ifndef __NR_sched_setaffinity
203 #define __NR_sched_setaffinity 241
204 #elif __NR_sched_setaffinity != 241
205 #error Wrong code for setaffinity system call.
206 #endif /* __NR_sched_setaffinity */
207 #ifndef __NR_sched_getaffinity
208 #define __NR_sched_getaffinity 242
209 #elif __NR_sched_getaffinity != 242
210 #error Wrong code for getaffinity system call.
211 #endif /* __NR_sched_getaffinity */
212 #elif KMP_ARCH_AARCH64
213 #ifndef __NR_sched_setaffinity
214 #define __NR_sched_setaffinity 122
215 #elif __NR_sched_setaffinity != 122
216 #error Wrong code for setaffinity system call.
217 #endif /* __NR_sched_setaffinity */
218 #ifndef __NR_sched_getaffinity
219 #define __NR_sched_getaffinity 123
220 #elif __NR_sched_getaffinity != 123
221 #error Wrong code for getaffinity system call.
222 #endif /* __NR_sched_getaffinity */
223 #elif KMP_ARCH_RISCV64
224 #ifndef __NR_sched_setaffinity
225 #define __NR_sched_setaffinity 122
226 #elif __NR_sched_setaffinity != 122
227 #error Wrong code for setaffinity system call.
228 #endif /* __NR_sched_setaffinity */
229 #ifndef __NR_sched_getaffinity
230 #define __NR_sched_getaffinity 123
231 #elif __NR_sched_getaffinity != 123
232 #error Wrong code for getaffinity system call.
233 #endif /* __NR_sched_getaffinity */
234 #elif KMP_ARCH_X86_64
235 #ifndef __NR_sched_setaffinity
236 #define __NR_sched_setaffinity 203
237 #elif __NR_sched_setaffinity != 203
238 #error Wrong code for setaffinity system call.
239 #endif /* __NR_sched_setaffinity */
240 #ifndef __NR_sched_getaffinity
241 #define __NR_sched_getaffinity 204
242 #elif __NR_sched_getaffinity != 204
243 #error Wrong code for getaffinity system call.
244 #endif /* __NR_sched_getaffinity */
245 #elif KMP_ARCH_PPC64
246 #ifndef __NR_sched_setaffinity
247 #define __NR_sched_setaffinity 222
248 #elif __NR_sched_setaffinity != 222
249 #error Wrong code for setaffinity system call.
250 #endif /* __NR_sched_setaffinity */
251 #ifndef __NR_sched_getaffinity
252 #define __NR_sched_getaffinity 223
253 #elif __NR_sched_getaffinity != 223
254 #error Wrong code for getaffinity system call.
255 #endif /* __NR_sched_getaffinity */
256 #elif KMP_ARCH_MIPS
257 #ifndef __NR_sched_setaffinity
258 #define __NR_sched_setaffinity 4239
259 #elif __NR_sched_setaffinity != 4239
260 #error Wrong code for setaffinity system call.
261 #endif /* __NR_sched_setaffinity */
262 #ifndef __NR_sched_getaffinity
263 #define __NR_sched_getaffinity 4240
264 #elif __NR_sched_getaffinity != 4240
265 #error Wrong code for getaffinity system call.
266 #endif /* __NR_sched_getaffinity */
267 #elif KMP_ARCH_MIPS64
268 #ifndef __NR_sched_setaffinity
269 #define __NR_sched_setaffinity 5195
270 #elif __NR_sched_setaffinity != 5195
271 #error Wrong code for setaffinity system call.
272 #endif /* __NR_sched_setaffinity */
273 #ifndef __NR_sched_getaffinity
274 #define __NR_sched_getaffinity 5196
275 #elif __NR_sched_getaffinity != 5196
276 #error Wrong code for getaffinity system call.
277 #endif /* __NR_sched_getaffinity */
278 #elif KMP_ARCH_LOONGARCH64
279 #ifndef __NR_sched_setaffinity
280 #define __NR_sched_setaffinity 122
281 #elif __NR_sched_setaffinity != 122
282 #error Wrong code for setaffinity system call.
283 #endif /* __NR_sched_setaffinity */
284 #ifndef __NR_sched_getaffinity
285 #define __NR_sched_getaffinity 123
286 #elif __NR_sched_getaffinity != 123
287 #error Wrong code for getaffinity system call.
288 #endif /* __NR_sched_getaffinity */
289 #elif KMP_ARCH_RISCV64
290 #ifndef __NR_sched_setaffinity
291 #define __NR_sched_setaffinity 122
292 #elif __NR_sched_setaffinity != 122
293 #error Wrong code for setaffinity system call.
294 #endif /* __NR_sched_setaffinity */
295 #ifndef __NR_sched_getaffinity
296 #define __NR_sched_getaffinity 123
297 #elif __NR_sched_getaffinity != 123
298 #error Wrong code for getaffinity system call.
299 #endif /* __NR_sched_getaffinity */
300 #elif KMP_ARCH_VE
301 #ifndef __NR_sched_setaffinity
302 #define __NR_sched_setaffinity 203
303 #elif __NR_sched_setaffinity != 203
304 #error Wrong code for setaffinity system call.
305 #endif /* __NR_sched_setaffinity */
306 #ifndef __NR_sched_getaffinity
307 #define __NR_sched_getaffinity 204
308 #elif __NR_sched_getaffinity != 204
309 #error Wrong code for getaffinity system call.
310 #endif /* __NR_sched_getaffinity */
311 #elif KMP_ARCH_S390X
312 #ifndef __NR_sched_setaffinity
313 #define __NR_sched_setaffinity 239
314 #elif __NR_sched_setaffinity != 239
315 #error Wrong code for setaffinity system call.
316 #endif /* __NR_sched_setaffinity */
317 #ifndef __NR_sched_getaffinity
318 #define __NR_sched_getaffinity 240
319 #elif __NR_sched_getaffinity != 240
320 #error Wrong code for getaffinity system call.
321 #endif /* __NR_sched_getaffinity */
322 #else
323 #error Unknown or unsupported architecture
324 #endif /* KMP_ARCH_* */
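/* These raw syscall numbers exist because the runtime issues the affinity
   syscalls directly on Linux; an illustrative call (mirroring the code further
   below) is:
     syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask); */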
325 #elif KMP_OS_FREEBSD
326 #include <pthread.h>
327 #include <pthread_np.h>
328 #elif KMP_OS_AIX
329 #include <sys/dr.h>
330 #include <sys/rset.h>
331 #define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
332 #endif
333 class KMPNativeAffinity : public KMPAffinity {
334  class Mask : public KMPAffinity::Mask {
335  typedef unsigned long mask_t;
336  typedef decltype(__kmp_affin_mask_size) mask_size_type;
337  static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
338  static const mask_t ONE = 1;
339  mask_size_type get_num_mask_types() const {
340  return __kmp_affin_mask_size / sizeof(mask_t);
341  }
342 
343  public:
344  mask_t *mask;
345  Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
346  ~Mask() {
347  if (mask)
348  __kmp_free(mask);
349  }
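 // Bit i of the mask lives in word i / BITS_PER_MASK_T at bit position
 // i % BITS_PER_MASK_T; for illustration, with a 64-bit mask_t, CPU 70 maps
 // to mask[1], bit 6.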
350  void set(int i) override {
351  mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
352  }
353  bool is_set(int i) const override {
354  return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
355  }
356  void clear(int i) override {
357  mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
358  }
359  void zero() override {
360  mask_size_type e = get_num_mask_types();
361  for (mask_size_type i = 0; i < e; ++i)
362  mask[i] = (mask_t)0;
363  }
364  bool empty() const override {
365  mask_size_type e = get_num_mask_types();
366  for (mask_size_type i = 0; i < e; ++i)
367  if (mask[i] != (mask_t)0)
368  return false;
369  return true;
370  }
371  void copy(const KMPAffinity::Mask *src) override {
372  const Mask *convert = static_cast<const Mask *>(src);
373  mask_size_type e = get_num_mask_types();
374  for (mask_size_type i = 0; i < e; ++i)
375  mask[i] = convert->mask[i];
376  }
377  void bitwise_and(const KMPAffinity::Mask *rhs) override {
378  const Mask *convert = static_cast<const Mask *>(rhs);
379  mask_size_type e = get_num_mask_types();
380  for (mask_size_type i = 0; i < e; ++i)
381  mask[i] &= convert->mask[i];
382  }
383  void bitwise_or(const KMPAffinity::Mask *rhs) override {
384  const Mask *convert = static_cast<const Mask *>(rhs);
385  mask_size_type e = get_num_mask_types();
386  for (mask_size_type i = 0; i < e; ++i)
387  mask[i] |= convert->mask[i];
388  }
389  void bitwise_not() override {
390  mask_size_type e = get_num_mask_types();
391  for (mask_size_type i = 0; i < e; ++i)
392  mask[i] = ~(mask[i]);
393  }
394  bool is_equal(const KMPAffinity::Mask *rhs) const override {
395  const Mask *convert = static_cast<const Mask *>(rhs);
396  mask_size_type e = get_num_mask_types();
397  for (mask_size_type i = 0; i < e; ++i)
398  if (mask[i] != convert->mask[i])
399  return false;
400  return true;
401  }
402  int begin() const override {
403  int retval = 0;
404  while (retval < end() && !is_set(retval))
405  ++retval;
406  return retval;
407  }
408  int end() const override {
409  int e;
410  __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
411  return e;
412  }
413  int next(int previous) const override {
414  int retval = previous + 1;
415  while (retval < end() && !is_set(retval))
416  ++retval;
417  return retval;
418  }
419 #if KMP_OS_AIX
420  // On AIX, there is no way to get the CPU(s) a thread is bound to.
421  // This routine is only used to get the full mask.
422  int get_system_affinity(bool abort_on_error) override {
423  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
424  "Illegal get affinity operation when not capable");
425 
426  (void)abort_on_error;
427 
428  // Set the mask with all CPUs that are available.
429  for (int i = 0; i < __kmp_xproc; ++i)
430  KMP_CPU_SET(i, this);
431  return 0;
432  }
433  int set_system_affinity(bool abort_on_error) const override {
434  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
435  "Illegal set affinity operation when not capable");
437 
438  int location;
439  int gtid = __kmp_entry_gtid();
440  int tid = thread_self();
441 
442  // Unbind the thread if it was bound to any processors before, so that
443  // we can bind it only to the CPUs specified by the mask.
444  int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);
445 
446  // On AIX, the bindprocessor() system call binds a thread to a single CPU
447  // at a time rather than to a set of CPUs, so iterate over the mask.
448  KMP_CPU_SET_ITERATE(location, this) {
449  if (KMP_CPU_ISSET(location, this)) {
450  retval = bindprocessor(BINDTHREAD, tid, location);
451  if (retval == -1 && errno == 1) {
452  rsid_t rsid;
453  rsethandle_t rsh;
454  // Put something in rsh to prevent compiler warning
455  // about uninitialized use
456  rsh = rs_alloc(RS_EMPTY);
457  rsid.at_pid = getpid();
458  if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
459  retval = ra_detachrset(R_PROCESS, rsid, 0);
460  retval = bindprocessor(BINDTHREAD, tid, location);
461  }
462  }
463  if (retval == 0) {
464  KA_TRACE(10, ("__kmp_set_system_affinity: Done binding "
465  "T#%d to cpu=%d.\n",
466  gtid, location));
467  continue;
468  }
469  int error = errno;
470  if (abort_on_error) {
471  __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
472  KMP_ERR(error), __kmp_msg_null);
473  KA_TRACE(10, ("__kmp_set_system_affinity: Error binding "
474  "T#%d to cpu=%d, errno=%d.\n",
475  gtid, location, error));
476  return error;
477  }
478  }
479  }
480  return 0;
481  }
482 #else // !KMP_OS_AIX
483  int get_system_affinity(bool abort_on_error) override {
484  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
485  "Illegal get affinity operation when not capable");
486 #if KMP_OS_LINUX
487  long retval =
488  syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
489 #elif KMP_OS_FREEBSD
490  int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
491  reinterpret_cast<cpuset_t *>(mask));
492  int retval = (r == 0 ? 0 : -1);
493 #endif
494  if (retval >= 0) {
495  return 0;
496  }
497  int error = errno;
498  if (abort_on_error) {
499  __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
500  KMP_ERR(error), __kmp_msg_null);
501  }
502  return error;
503  }
504  int set_system_affinity(bool abort_on_error) const override {
505  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
506  "Illegal set affinity operation when not capable");
507 #if KMP_OS_LINUX
508  long retval =
509  syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
510 #elif KMP_OS_FREEBSD
511  int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
512  reinterpret_cast<cpuset_t *>(mask));
513  int retval = (r == 0 ? 0 : -1);
514 #endif
515  if (retval >= 0) {
516  return 0;
517  }
518  int error = errno;
519  if (abort_on_error) {
520  __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
521  KMP_ERR(error), __kmp_msg_null);
522  }
523  return error;
524  }
525 #endif // KMP_OS_AIX
526  };
527  void determine_capable(const char *env_var) override {
528  __kmp_affinity_determine_capable(env_var);
529  }
530  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
531  KMPAffinity::Mask *allocate_mask() override {
532  KMPNativeAffinity::Mask *retval = new Mask();
533  return retval;
534  }
535  void deallocate_mask(KMPAffinity::Mask *m) override {
536  KMPNativeAffinity::Mask *native_mask =
537  static_cast<KMPNativeAffinity::Mask *>(m);
538  delete native_mask;
539  }
540  KMPAffinity::Mask *allocate_mask_array(int num) override {
541  return new Mask[num];
542  }
543  void deallocate_mask_array(KMPAffinity::Mask *array) override {
544  Mask *linux_array = static_cast<Mask *>(array);
545  delete[] linux_array;
546  }
547  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
548  int index) override {
549  Mask *linux_array = static_cast<Mask *>(array);
550  return &(linux_array[index]);
551  }
552  api_type get_api_type() const override { return NATIVE_OS; }
553 };
554 #endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX */
555 
556 #if KMP_OS_WINDOWS
557 class KMPNativeAffinity : public KMPAffinity {
558  class Mask : public KMPAffinity::Mask {
559  typedef ULONG_PTR mask_t;
560  static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
561  mask_t *mask;
562 
563  public:
564  Mask() {
565  mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
566  }
567  ~Mask() {
568  if (mask)
569  __kmp_free(mask);
570  }
571  void set(int i) override {
572  mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
573  }
574  bool is_set(int i) const override {
575  return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
576  }
577  void clear(int i) override {
578  mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
579  }
580  void zero() override {
581  for (int i = 0; i < __kmp_num_proc_groups; ++i)
582  mask[i] = 0;
583  }
584  bool empty() const override {
585  for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
586  if (mask[i])
587  return false;
588  return true;
589  }
590  void copy(const KMPAffinity::Mask *src) override {
591  const Mask *convert = static_cast<const Mask *>(src);
592  for (int i = 0; i < __kmp_num_proc_groups; ++i)
593  mask[i] = convert->mask[i];
594  }
595  void bitwise_and(const KMPAffinity::Mask *rhs) override {
596  const Mask *convert = static_cast<const Mask *>(rhs);
597  for (int i = 0; i < __kmp_num_proc_groups; ++i)
598  mask[i] &= convert->mask[i];
599  }
600  void bitwise_or(const KMPAffinity::Mask *rhs) override {
601  const Mask *convert = static_cast<const Mask *>(rhs);
602  for (int i = 0; i < __kmp_num_proc_groups; ++i)
603  mask[i] |= convert->mask[i];
604  }
605  void bitwise_not() override {
606  for (int i = 0; i < __kmp_num_proc_groups; ++i)
607  mask[i] = ~(mask[i]);
608  }
609  bool is_equal(const KMPAffinity::Mask *rhs) const override {
610  const Mask *convert = static_cast<const Mask *>(rhs);
611  for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
612  if (mask[i] != convert->mask[i])
613  return false;
614  return true;
615  }
616  int begin() const override {
617  int retval = 0;
618  while (retval < end() && !is_set(retval))
619  ++retval;
620  return retval;
621  }
622  int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
623  int next(int previous) const override {
624  int retval = previous + 1;
625  while (retval < end() && !is_set(retval))
626  ++retval;
627  return retval;
628  }
629  int set_process_affinity(bool abort_on_error) const override {
630  if (__kmp_num_proc_groups <= 1) {
631  if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
632  DWORD error = GetLastError();
633  if (abort_on_error) {
634  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
635  __kmp_msg_null);
636  }
637  return error;
638  }
639  }
640  return 0;
641  }
642  int set_system_affinity(bool abort_on_error) const override {
643  if (__kmp_num_proc_groups > 1) {
644  // Check for a valid mask.
645  GROUP_AFFINITY ga;
646  int group = get_proc_group();
647  if (group < 0) {
648  if (abort_on_error) {
649  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
650  }
651  return -1;
652  }
653  // Transform the bit vector into a GROUP_AFFINITY struct
654  // and make the system call to set affinity.
655  ga.Group = group;
656  ga.Mask = mask[group];
657  ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
658 
659  KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
660  if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
661  DWORD error = GetLastError();
662  if (abort_on_error) {
663  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
664  __kmp_msg_null);
665  }
666  return error;
667  }
668  } else {
669  if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
670  DWORD error = GetLastError();
671  if (abort_on_error) {
672  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
673  __kmp_msg_null);
674  }
675  return error;
676  }
677  }
678  return 0;
679  }
680  int get_system_affinity(bool abort_on_error) override {
681  if (__kmp_num_proc_groups > 1) {
682  this->zero();
683  GROUP_AFFINITY ga;
684  KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
685  if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
686  DWORD error = GetLastError();
687  if (abort_on_error) {
688  __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
689  KMP_ERR(error), __kmp_msg_null);
690  }
691  return error;
692  }
693  if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
694  (ga.Mask == 0)) {
695  return -1;
696  }
697  mask[ga.Group] = ga.Mask;
698  } else {
699  mask_t newMask, sysMask, retval;
700  if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
701  DWORD error = GetLastError();
702  if (abort_on_error) {
703  __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
704  KMP_ERR(error), __kmp_msg_null);
705  }
706  return error;
707  }
708  retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
709  if (!retval) {
710  DWORD error = GetLastError();
711  if (abort_on_error) {
712  __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
713  KMP_ERR(error), __kmp_msg_null);
714  }
715  return error;
716  }
717  newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
718  if (!newMask) {
719  DWORD error = GetLastError();
720  if (abort_on_error) {
721  __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
722  KMP_ERR(error), __kmp_msg_null);
723  }
724  }
725  *mask = retval;
726  }
727  return 0;
728  }
729  int get_proc_group() const override {
730  int group = -1;
731  if (__kmp_num_proc_groups == 1) {
732  return 1;
733  }
734  for (int i = 0; i < __kmp_num_proc_groups; i++) {
735  if (mask[i] == 0)
736  continue;
737  if (group >= 0)
738  return -1;
739  group = i;
740  }
741  return group;
742  }
743  };
744  void determine_capable(const char *env_var) override {
745  __kmp_affinity_determine_capable(env_var);
746  }
747  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
748  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
749  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
750  KMPAffinity::Mask *allocate_mask_array(int num) override {
751  return new Mask[num];
752  }
753  void deallocate_mask_array(KMPAffinity::Mask *array) override {
754  Mask *windows_array = static_cast<Mask *>(array);
755  delete[] windows_array;
756  }
757  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
758  int index) override {
759  Mask *windows_array = static_cast<Mask *>(array);
760  return &(windows_array[index]);
761  }
762  api_type get_api_type() const override { return NATIVE_OS; }
763 };
764 #endif /* KMP_OS_WINDOWS */
765 #endif /* KMP_AFFINITY_SUPPORTED */
766 
767 // Describe an attribute for a level in the machine topology
768 struct kmp_hw_attr_t {
769  int core_type : 8;
770  int core_eff : 8;
771  unsigned valid : 1;
772  unsigned reserved : 15;
773 
774  static const int UNKNOWN_CORE_EFF = -1;
775 
776  kmp_hw_attr_t()
777  : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
778  valid(0), reserved(0) {}
779  void set_core_type(kmp_hw_core_type_t type) {
780  valid = 1;
781  core_type = type;
782  }
783  void set_core_eff(int eff) {
784  valid = 1;
785  core_eff = eff;
786  }
787  kmp_hw_core_type_t get_core_type() const {
788  return (kmp_hw_core_type_t)core_type;
789  }
790  int get_core_eff() const { return core_eff; }
791  bool is_core_type_valid() const {
792  return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
793  }
794  bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
795  operator bool() const { return valid; }
796  void clear() {
797  core_type = KMP_HW_CORE_TYPE_UNKNOWN;
798  core_eff = UNKNOWN_CORE_EFF;
799  valid = 0;
800  }
801  bool contains(const kmp_hw_attr_t &other) const {
802  if (!valid && !other.valid)
803  return true;
804  if (valid && other.valid) {
805  if (other.is_core_type_valid()) {
806  if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
807  return false;
808  }
809  if (other.is_core_eff_valid()) {
810  if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
811  return false;
812  }
813  return true;
814  }
815  return false;
816  }
817 #if KMP_AFFINITY_SUPPORTED
818  bool contains(const kmp_affinity_attrs_t &attr) const {
819  if (!valid && !attr.valid)
820  return true;
821  if (valid && attr.valid) {
822  if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
823  return (is_core_type_valid() &&
824  (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
825  if (attr.core_eff != UNKNOWN_CORE_EFF)
826  return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
827  return true;
828  }
829  return false;
830  }
831 #endif // KMP_AFFINITY_SUPPORTED
832  bool operator==(const kmp_hw_attr_t &rhs) const {
833  return (rhs.valid == valid && rhs.core_eff == core_eff &&
834  rhs.core_type == core_type);
835  }
836  bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
837 };
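// Illustrative sketch of contains() semantics (hypothetical x86 hybrid values):
//   kmp_hw_attr_t a, b;
//   a.set_core_type(KMP_HW_CORE_TYPE_CORE);
//   b.set_core_type(KMP_HW_CORE_TYPE_CORE);
//   b.set_core_eff(1);
//   // b.contains(a) is true  (b satisfies a's core-type constraint);
//   // a.contains(b) is false (a has no core efficiency to match b's).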
838 
839 #if KMP_AFFINITY_SUPPORTED
840 KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
841 #endif
842 
843 class kmp_hw_thread_t {
844 public:
845  static const int UNKNOWN_ID = -1;
846  static const int MULTIPLE_ID = -2;
847  static int compare_ids(const void *a, const void *b);
848  static int compare_compact(const void *a, const void *b);
849  int ids[KMP_HW_LAST];
850  int sub_ids[KMP_HW_LAST];
851  bool leader;
852  int os_id;
853  kmp_hw_attr_t attrs;
854 
855  void print() const;
856  void clear() {
857  for (int i = 0; i < (int)KMP_HW_LAST; ++i)
858  ids[i] = UNKNOWN_ID;
859  leader = false;
860  attrs.clear();
861  }
862 };
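// For illustration (assuming a three-level socket/core/thread topology), a
// hardware thread might carry ids = { 1, 5, 0 }: the OS-reported socket,
// core, and thread ids at each topology level, while sub_ids hold the
// corresponding logical (0-based, per-parent) indices filled in by
// kmp_topology_t::_set_sub_ids().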
863 
864 class kmp_topology_t {
865 
866  struct flags_t {
867  int uniform : 1;
868  int reserved : 31;
869  };
870 
871  int depth;
872 
873  // The following arrays are all 'depth' long and have been allocated to
874  // hold up to KMP_HW_LAST objects if needed, so layers can be added
875  // without reallocating any array
876 
877  // Ordered array of the types in the topology
878  kmp_hw_t *types;
879 
880  // Quick topology ratios; for non-uniform topologies, each entry holds
881  // the max number of itemAs per itemB,
882  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
883  int *ratio;
884 
885  // Storage containing the absolute number of each topology layer
886  int *count;
887 
888  // The number of core efficiencies. This is only useful for hybrid
889  // topologies. Core efficiencies will range from 0 to num efficiencies - 1
890  int num_core_efficiencies;
891  int num_core_types;
892  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];
893 
894  // The hardware threads array
895  // hw_threads is num_hw_threads long
896  // Each hw_thread's ids and sub_ids are depth deep
897  int num_hw_threads;
898  kmp_hw_thread_t *hw_threads;
899 
900  // Equivalence hash where the key is the hardware topology item
901  // and the value is the equivalent hardware topology type in the
902  // types[] array; if the value is KMP_HW_UNKNOWN, then there is no
903  // known equivalence for the topology type
904  kmp_hw_t equivalent[KMP_HW_LAST];
905 
906  // Flags describing the topology
907  flags_t flags;
908 
909  // Compact value used during sort_compact()
910  int compact;
911 
912  // Insert a new topology layer after allocation
913  void _insert_layer(kmp_hw_t type, const int *ids);
914 
915 #if KMP_GROUP_AFFINITY
916  // Insert topology information about Windows Processor groups
917  void _insert_windows_proc_groups();
918 #endif
919 
920  // Count each item & get the num x's per y
921  // e.g., get the number of cores and the number of threads per core
922  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
923  void _gather_enumeration_information();
924 
925  // Remove layers that don't add information to the topology.
926  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
927  void _remove_radix1_layers();
928 
929  // Find out if the topology is uniform
930  void _discover_uniformity();
931 
932  // Set all the sub_ids for each hardware thread
933  void _set_sub_ids();
934 
935  // Set global affinity variables describing the number of threads per
936  // core, the number of packages, the number of cores per package, and
937  // the number of cores.
938  void _set_globals();
939 
940  // Set the last level cache equivalent type
941  void _set_last_level_cache();
942 
943  // Return the number of cores with a particular attribute, 'attr'.
944  // If 'find_all' is true, count all cores on the machine; otherwise count
945  // cores per instance of the layer 'above'
946  int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
947  bool find_all = false) const;
948 
949 public:
950  // Force use of allocate()/deallocate()
951  kmp_topology_t() = delete;
952  kmp_topology_t(const kmp_topology_t &t) = delete;
953  kmp_topology_t(kmp_topology_t &&t) = delete;
954  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
955  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
956 
957  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
958  static void deallocate(kmp_topology_t *);
959 
960  // Functions used in create_map() routines
961  kmp_hw_thread_t &at(int index) {
962  KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
963  return hw_threads[index];
964  }
965  const kmp_hw_thread_t &at(int index) const {
966  KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
967  return hw_threads[index];
968  }
969  int get_num_hw_threads() const { return num_hw_threads; }
970  void sort_ids() {
971  qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
972  kmp_hw_thread_t::compare_ids);
973  }
974  // Check if the hardware ids are unique; return true if they are,
975  // false otherwise
976  bool check_ids() const;
977 
978  // Function to call after the create_map() routine
979  void canonicalize();
980  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
981 
982 // Functions used after canonicalize() called
983 
984 #if KMP_AFFINITY_SUPPORTED
985  // Set the granularity for affinity settings
986  void set_granularity(kmp_affinity_t &stgs) const;
987  bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
988  bool restrict_to_mask(const kmp_affin_mask_t *mask);
989  bool filter_hw_subset();
990 #endif
991  bool is_uniform() const { return flags.uniform; }
992  // Return the topology's equivalent for the given type;
993  // returns KMP_HW_UNKNOWN when there is no equivalent type
994  kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
995  if (type == KMP_HW_UNKNOWN)
996  return KMP_HW_UNKNOWN;
997  return equivalent[type];
998  }
999  // Set type1 = type2
1000  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
1001  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
1002  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
1003  kmp_hw_t real_type2 = equivalent[type2];
1004  if (real_type2 == KMP_HW_UNKNOWN)
1005  real_type2 = type2;
1006  equivalent[type1] = real_type2;
1007  // This loop is required since any of the types may have been set to
1008  // be equivalent to type1. They all must be checked and reset to type2.
1009  KMP_FOREACH_HW_TYPE(type) {
1010  if (equivalent[type] == type1) {
1011  equivalent[type] = real_type2;
1012  }
1013  }
1014  }
1015  // Calculate number of types corresponding to level1
1016  // per types corresponding to level2 (e.g., number of threads per core)
1017  int calculate_ratio(int level1, int level2) const {
1018  KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
1019  KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
1020  int r = 1;
1021  for (int level = level1; level > level2; --level)
1022  r *= ratio[level];
1023  return r;
1024  }
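 // For illustration, with ratio = [ 4 packages | 6 cores/pkg | 2 threads/core ]
 // and levels 0=package, 1=core, 2=thread, calculate_ratio(2, 0) returns
 // 2 * 6 = 12, i.e. the number of threads per package.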
1025  int get_ratio(int level) const {
1026  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1027  return ratio[level];
1028  }
1029  int get_depth() const { return depth; }
1030  kmp_hw_t get_type(int level) const {
1031  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1032  return types[level];
1033  }
1034  int get_level(kmp_hw_t type) const {
1035  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
1036  int eq_type = equivalent[type];
1037  if (eq_type == KMP_HW_UNKNOWN)
1038  return -1;
1039  for (int i = 0; i < depth; ++i)
1040  if (types[i] == eq_type)
1041  return i;
1042  return -1;
1043  }
1044  int get_count(int level) const {
1045  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1046  return count[level];
1047  }
1048  // Return the total number of cores with attribute 'attr'
1049  int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
1050  return _get_ncores_with_attr(attr, -1, true);
1051  }
1052  // Return the number of cores with attribute
1053  // 'attr' per topology level 'above'
1054  int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
1055  return _get_ncores_with_attr(attr, above, false);
1056  }
1057 
1058 #if KMP_AFFINITY_SUPPORTED
1059  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
1060  void sort_compact(kmp_affinity_t &affinity) {
1061  compact = affinity.compact;
1062  qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
1063  kmp_hw_thread_t::compare_compact);
1064  }
1065 #endif
1066  void print(const char *env_var = "KMP_AFFINITY") const;
1067  void dump() const;
1068 };
1069 extern kmp_topology_t *__kmp_topology;
1070 
1071 class kmp_hw_subset_t {
1072  const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;
1073 
1074 public:
1075  // Describe a machine topology item in KMP_HW_SUBSET
1076  struct item_t {
1077  kmp_hw_t type;
1078  int num_attrs;
1079  int num[MAX_ATTRS];
1080  int offset[MAX_ATTRS];
1081  kmp_hw_attr_t attr[MAX_ATTRS];
1082  };
1083  // Put parentheses around max to avoid accidental use of the Windows max macro.
1084  const static int USE_ALL = (std::numeric_limits<int>::max)();
1085 
1086 private:
1087  int depth;
1088  int capacity;
1089  item_t *items;
1090  kmp_uint64 set;
1091  bool absolute;
1092  // The set must be able to handle up to KMP_HW_LAST number of layers
1093  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
1094  // Comparator used to sort the KMP_HW_SUBSET items into topology order.
1095  // All unknown topology types sort to the beginning of the subset.
1096  static int hw_subset_compare(const void *i1, const void *i2) {
1097  kmp_hw_t type1 = ((const item_t *)i1)->type;
1098  kmp_hw_t type2 = ((const item_t *)i2)->type;
1099  int level1 = __kmp_topology->get_level(type1);
1100  int level2 = __kmp_topology->get_level(type2);
1101  return level1 - level2;
1102  }
1103 
1104 public:
1105  // Force use of allocate()/deallocate()
1106  kmp_hw_subset_t() = delete;
1107  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
1108  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
1109  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
1110  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;
1111 
1112  static kmp_hw_subset_t *allocate() {
1113  int initial_capacity = 5;
1114  kmp_hw_subset_t *retval =
1115  (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
1116  retval->depth = 0;
1117  retval->capacity = initial_capacity;
1118  retval->set = 0ull;
1119  retval->absolute = false;
1120  retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
1121  return retval;
1122  }
1123  static void deallocate(kmp_hw_subset_t *subset) {
1124  __kmp_free(subset->items);
1125  __kmp_free(subset);
1126  }
1127  void set_absolute() { absolute = true; }
1128  bool is_absolute() const { return absolute; }
1129  void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
1130  for (int i = 0; i < depth; ++i) {
1131  // Found an existing item for this layer type
1132  // Add the num, offset, and attr to this item
1133  if (items[i].type == type) {
1134  int idx = items[i].num_attrs++;
1135  if ((size_t)idx >= MAX_ATTRS)
1136  return;
1137  items[i].num[idx] = num;
1138  items[i].offset[idx] = offset;
1139  items[i].attr[idx] = attr;
1140  return;
1141  }
1142  }
1143  if (depth == capacity - 1) {
1144  capacity *= 2;
1145  item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
1146  for (int i = 0; i < depth; ++i)
1147  new_items[i] = items[i];
1148  __kmp_free(items);
1149  items = new_items;
1150  }
1151  items[depth].num_attrs = 1;
1152  items[depth].type = type;
1153  items[depth].num[0] = num;
1154  items[depth].offset[0] = offset;
1155  items[depth].attr[0] = attr;
1156  depth++;
1157  set |= (1ull << type);
1158  }
1159  int get_depth() const { return depth; }
1160  const item_t &at(int index) const {
1161  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1162  return items[index];
1163  }
1164  item_t &at(int index) {
1165  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1166  return items[index];
1167  }
1168  void remove(int index) {
1169  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1170  set &= ~(1ull << items[index].type);
1171  for (int j = index + 1; j < depth; ++j) {
1172  items[j - 1] = items[j];
1173  }
1174  depth--;
1175  }
1176  void sort() {
1177  KMP_DEBUG_ASSERT(__kmp_topology);
1178  qsort(items, depth, sizeof(item_t), hw_subset_compare);
1179  }
1180  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
1181  void dump() const {
1182  printf("**********************\n");
1183  printf("*** kmp_hw_subset: ***\n");
1184  printf("* depth: %d\n", depth);
1185  printf("* items:\n");
1186  for (int i = 0; i < depth; ++i) {
1187  printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
1188  for (int j = 0; j < items[i].num_attrs; ++j) {
1189  printf(" num: %d, offset: %d, attr: ", items[i].num[j],
1190  items[i].offset[j]);
1191  if (!items[i].attr[j]) {
1192  printf(" (none)\n");
1193  } else {
1194  printf(
1195  " core_type = %s, core_eff = %d\n",
1196  __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
1197  items[i].attr[j].get_core_eff());
1198  }
1199  }
1200  }
1201  printf("* set: 0x%llx\n", set);
1202  printf("* absolute: %d\n", absolute);
1203  printf("**********************\n");
1204  }
1205 };
1206 extern kmp_hw_subset_t *__kmp_hw_subset;
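// For illustration (assuming the documented KMP_HW_SUBSET syntax), a value such
// as "2s,4c,2t" would be stored as three items:
//   { type = KMP_HW_SOCKET, num = {2} }, { type = KMP_HW_CORE, num = {4} },
//   { type = KMP_HW_THREAD, num = {2} }, each with offset 0 and no attribute.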
1207 
1208 /* A structure for holding machine-specific hierarchy info to be computed once
1209  at init. This structure represents a mapping of threads to the actual machine
1210  hierarchy, or to our best guess at what the hierarchy might be, for the
1211  purpose of performing an efficient barrier. In the worst case, when there is
1212  no machine hierarchy information, it produces a tree suitable for a barrier,
1213  similar to the tree used in the hyper barrier. */
1214 class hierarchy_info {
1215 public:
1216  /* Good default values for number of leaves and branching factor, given no
1217  affinity information. Behaves a bit like hyper barrier. */
1218  static const kmp_uint32 maxLeaves = 4;
1219  static const kmp_uint32 minBranch = 4;
1225  kmp_uint32 maxLevels;
1226 
1231  kmp_uint32 depth;
1232  kmp_uint32 base_num_threads;
1233  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
1234  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
1235  // 2=initialization in progress
1236  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
1237 
1242  kmp_uint32 *numPerLevel;
1243  kmp_uint32 *skipPerLevel;
1244 
1245  void deriveLevels() {
1246  int hier_depth = __kmp_topology->get_depth();
1247  for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
1248  numPerLevel[level] = __kmp_topology->get_ratio(i);
1249  }
1250  }
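 // For illustration, a topology of [ 4 packages | 6 cores/pkg | 2 threads/core ]
 // yields numPerLevel = { 2, 6, 4, 1, 1, 1, 1 }: level 0 is the innermost
 // (leaf-side) layer, and the unused upper levels stay at 1.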
1251 
1252  hierarchy_info()
1253  : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
1254 
1255  void fini() {
1256  if (!uninitialized && numPerLevel) {
1257  __kmp_free(numPerLevel);
1258  numPerLevel = NULL;
1259  uninitialized = not_initialized;
1260  }
1261  }
1262 
1263  void init(int num_addrs) {
1264  kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
1265  &uninitialized, not_initialized, initializing);
1266  if (bool_result == 0) { // Wait for initialization
1267  while (TCR_1(uninitialized) != initialized)
1268  KMP_CPU_PAUSE();
1269  return;
1270  }
1271  KMP_DEBUG_ASSERT(bool_result == 1);
1272 
1273  /* Explicitly initialize the data fields here to prevent use of dirty
1274  values observed when a static library is re-initialized multiple times
1275  (e.g., when a non-OpenMP thread repeatedly launches/joins a thread that
1276  uses OpenMP). */
1277  depth = 1;
1278  resizing = 0;
1279  maxLevels = 7;
1280  numPerLevel =
1281  (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1282  skipPerLevel = &(numPerLevel[maxLevels]);
1283  for (kmp_uint32 i = 0; i < maxLevels;
1284  ++i) { // init numPerLevel[*] to 1 item per level
1285  numPerLevel[i] = 1;
1286  skipPerLevel[i] = 1;
1287  }
1288 
1289  // Derive the level sizes from the detected topology if available
1290  if (__kmp_topology && __kmp_topology->get_depth() > 0) {
1291  deriveLevels();
1292  } else {
1293  numPerLevel[0] = maxLeaves;
1294  numPerLevel[1] = num_addrs / maxLeaves;
1295  if (num_addrs % maxLeaves)
1296  numPerLevel[1]++;
1297  }
1298 
1299  base_num_threads = num_addrs;
1300  for (int i = maxLevels - 1; i >= 0;
1301  --i) // count non-empty levels to get depth
1302  if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
1303  depth++;
1304 
1305  kmp_uint32 branch = minBranch;
1306  if (numPerLevel[0] == 1)
1307  branch = num_addrs / maxLeaves;
1308  if (branch < minBranch)
1309  branch = minBranch;
1310  for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
1311  while (numPerLevel[d] > branch ||
1312  (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
1313  if (numPerLevel[d] & 1)
1314  numPerLevel[d]++;
1315  numPerLevel[d] = numPerLevel[d] >> 1;
1316  if (numPerLevel[d + 1] == 1)
1317  depth++;
1318  numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
1319  }
1320  if (numPerLevel[0] == 1) {
1321  branch = branch >> 1;
1322  if (branch < 4)
1323  branch = minBranch;
1324  }
1325  }
1326 
1327  for (kmp_uint32 i = 1; i < depth; ++i)
1328  skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
1329  // Fill in hierarchy in the case of oversubscription
1330  for (kmp_uint32 i = depth; i < maxLevels; ++i)
1331  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1332 
1333  uninitialized = initialized; // One writer
1334  }
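 // Rough reading of the result: skipPerLevel[i] is the number of leaves
 // spanned by one node at level i, e.g. numPerLevel = { 2, 4, ... } gives
 // skipPerLevel = { 1, 2, 8, ... }; levels beyond 'depth' keep doubling to
 // absorb oversubscription.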
1335 
1336  // Resize the hierarchy if nproc changes to something larger than before
1337  void resize(kmp_uint32 nproc) {
1338  kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1339  while (bool_result == 0) { // someone else is trying to resize
1340  KMP_CPU_PAUSE();
1341  if (nproc <= base_num_threads) // happy with other thread's resize
1342  return;
1343  else // try to resize
1344  bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1345  }
1346  KMP_DEBUG_ASSERT(bool_result != 0);
1347  if (nproc <= base_num_threads)
1348  return; // happy with other thread's resize
1349 
1350  // Calculate new maxLevels
1351  kmp_uint32 old_sz = skipPerLevel[depth - 1];
1352  kmp_uint32 incs = 0, old_maxLevels = maxLevels;
1353  // First see if old maxLevels is enough to contain new size
1354  for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
1355  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1356  numPerLevel[i - 1] *= 2;
1357  old_sz *= 2;
1358  depth++;
1359  }
1360  if (nproc > old_sz) { // Not enough space, need to expand hierarchy
1361  while (nproc > old_sz) {
1362  old_sz *= 2;
1363  incs++;
1364  depth++;
1365  }
1366  maxLevels += incs;
1367 
1368  // Resize arrays
1369  kmp_uint32 *old_numPerLevel = numPerLevel;
1370  kmp_uint32 *old_skipPerLevel = skipPerLevel;
1371  numPerLevel = skipPerLevel = NULL;
1372  numPerLevel =
1373  (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1374  skipPerLevel = &(numPerLevel[maxLevels]);
1375 
1376  // Copy old elements from old arrays
1377  for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
1378  // copy the previous per-level values
1379  numPerLevel[i] = old_numPerLevel[i];
1380  skipPerLevel[i] = old_skipPerLevel[i];
1381  }
1382 
1383  // Init new elements in arrays to 1
1384  for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
1385  // init numPerLevel[*] to 1 item per level
1386  numPerLevel[i] = 1;
1387  skipPerLevel[i] = 1;
1388  }
1389 
1390  // Free old arrays
1391  __kmp_free(old_numPerLevel);
1392  }
1393 
1394  // Fill in oversubscription levels of hierarchy
1395  for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
1396  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1397 
1398  base_num_threads = nproc;
1399  resizing = 0; // One writer
1400  }
1401 };
1402 #endif // KMP_AFFINITY_H