LLVM OpenMP* Runtime Library
kmp_affinity.cpp
1 /*
2  * kmp_affinity.cpp -- affinity management
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_io.h"
19 #include "kmp_str.h"
20 #include "kmp_wrapper_getpid.h"
21 
22 #if KMP_AFFINITY_SUPPORTED
23 
24 //
25 // Print the affinity mask to the character array in a pretty format.
26 //
27 char *
28 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
29 {
30  KMP_ASSERT(buf_len >= 40);
31  char *scan = buf;
32  char *end = buf + buf_len - 1;
33 
34  //
35  // Find first element / check for empty set.
36  //
37  size_t i;
38  for (i = 0; i < KMP_CPU_SETSIZE; i++) {
39  if (KMP_CPU_ISSET(i, mask)) {
40  break;
41  }
42  }
43  if (i == KMP_CPU_SETSIZE) {
44  KMP_SNPRINTF(scan, buf_len, "{<empty>}");
45  while (*scan != '\0') scan++;
46  KMP_ASSERT(scan <= end);
47  return buf;
48  }
49 
50  KMP_SNPRINTF(scan, buf_len, "{%ld", (long)i);
51  while (*scan != '\0') scan++;
52  i++;
53  for (; i < KMP_CPU_SETSIZE; i++) {
54  if (! KMP_CPU_ISSET(i, mask)) {
55  continue;
56  }
57 
58  //
59  // Check for buffer overflow. A string of the form ",<n>" will have
60  // at most 10 characters, plus we want to leave room to print ",...}"
61  // if the set is too large to print for a total of 15 characters.
62  // We already left room for '\0' in setting end.
63  //
64  if (end - scan < 15) {
65  break;
66  }
67  KMP_SNPRINTF(scan, buf_len, ",%-ld", (long)i);
68  while (*scan != '\0') scan++;
69  }
70  if (i < KMP_CPU_SETSIZE) {
71  KMP_SNPRINTF(scan, buf_len, ",...");
72  while (*scan != '\0') scan++;
73  }
74  KMP_SNPRINTF(scan, buf_len, "}");
75  while (*scan != '\0') scan++;
76  KMP_ASSERT(scan <= end);
77  return buf;
78 }
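//
// Illustrative sketch (editorial addition, not part of the runtime): a
// typical call site for __kmp_affinity_print_mask(). The verbose reporting
// code later in this file uses the same pattern, with a stack buffer of
// KMP_AFFIN_MASK_PRINT_LEN bytes - comfortably above the 40-byte minimum
// asserted above. The resulting string looks like "{0,1,2,3}", "{<empty>}",
// or "{0,1,...}" when the set does not fit in the buffer.
//
#if 0 // example only
static void
__kmp_example_report_mask(kmp_affin_mask_t *mask)
{
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
    KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); // consume the string
}
#endif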
79 
80 
81 void
82 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
83 {
84  KMP_CPU_ZERO(mask);
85 
86 # if KMP_GROUP_AFFINITY
87 
88  if (__kmp_num_proc_groups > 1) {
89  int group;
90  KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
91  for (group = 0; group < __kmp_num_proc_groups; group++) {
92  int i;
93  int num = __kmp_GetActiveProcessorCount(group);
94  for (i = 0; i < num; i++) {
95  KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
96  }
97  }
98  }
99  else
100 
101 # endif /* KMP_GROUP_AFFINITY */
102 
103  {
104  int proc;
105  for (proc = 0; proc < __kmp_xproc; proc++) {
106  KMP_CPU_SET(proc, mask);
107  }
108  }
109 }
110 
111 
112 //
113 // In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
114 // functions.
115 //
116 // The icc codegen emits sections with extremely long names, of the form
117 // ".gnu.linkonce.<mangled_name>". There seems to have been a linker bug
118 // introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
119 // some sort of memory corruption or table overflow that is triggered by
120 // these long strings. I checked the latest version of the linker -
121 // GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
122 // fixed.
123 //
124 // Unfortunately, my attempts to reproduce it in a smaller example have
125 // failed - I'm not sure what the prospects are of getting it fixed
126 // properly - but we need a reproducer smaller than all of libomp.
127 //
128 // Work around the problem by avoiding inline constructors in such builds.
129 // We do this for all platforms, not just Linux* OS - non-inline functions are
130 // more debuggable and provide better coverage than inline functions.
131 // Use inline functions in shipping libs, for performance.
132 //
133 
134 # if !defined(KMP_DEBUG) && !defined(COVER)
135 
136 class Address {
137 public:
138  static const unsigned maxDepth = 32;
139  unsigned labels[maxDepth];
140  unsigned childNums[maxDepth];
141  unsigned depth;
142  unsigned leader;
143  Address(unsigned _depth)
144  : depth(_depth), leader(FALSE) {
145  }
146  Address &operator=(const Address &b) {
147  depth = b.depth;
148  for (unsigned i = 0; i < depth; i++) {
149  labels[i] = b.labels[i];
150  childNums[i] = b.childNums[i];
151  }
152  leader = FALSE;
153  return *this;
154  }
155  bool operator==(const Address &b) const {
156  if (depth != b.depth)
157  return false;
158  for (unsigned i = 0; i < depth; i++)
159  if(labels[i] != b.labels[i])
160  return false;
161  return true;
162  }
163  bool isClose(const Address &b, int level) const {
164  if (depth != b.depth)
165  return false;
166  if ((unsigned)level >= depth)
167  return true;
168  for (unsigned i = 0; i < (depth - level); i++)
169  if(labels[i] != b.labels[i])
170  return false;
171  return true;
172  }
173  bool operator!=(const Address &b) const {
174  return !operator==(b);
175  }
176 };
177 
178 class AddrUnsPair {
179 public:
180  Address first;
181  unsigned second;
182  AddrUnsPair(Address _first, unsigned _second)
183  : first(_first), second(_second) {
184  }
185  AddrUnsPair &operator=(const AddrUnsPair &b)
186  {
187  first = b.first;
188  second = b.second;
189  return *this;
190  }
191 };
192 
193 # else
194 
195 class Address {
196 public:
197  static const unsigned maxDepth = 32;
198  unsigned labels[maxDepth];
199  unsigned childNums[maxDepth];
200  unsigned depth;
201  unsigned leader;
202  Address(unsigned _depth);
203  Address &operator=(const Address &b);
204  bool operator==(const Address &b) const;
205  bool isClose(const Address &b, int level) const;
206  bool operator!=(const Address &b) const;
207 };
208 
209 Address::Address(unsigned _depth)
210 {
211  depth = _depth;
212  leader = FALSE;
213 }
214 
215 Address &Address::operator=(const Address &b) {
216  depth = b.depth;
217  for (unsigned i = 0; i < depth; i++) {
218  labels[i] = b.labels[i];
219  childNums[i] = b.childNums[i];
220  }
221  leader = FALSE;
222  return *this;
223 }
224 
225 bool Address::operator==(const Address &b) const {
226  if (depth != b.depth)
227  return false;
228  for (unsigned i = 0; i < depth; i++)
229  if(labels[i] != b.labels[i])
230  return false;
231  return true;
232 }
233 
234 bool Address::isClose(const Address &b, int level) const {
235  if (depth != b.depth)
236  return false;
237  if ((unsigned)level >= depth)
238  return true;
239  for (unsigned i = 0; i < (depth - level); i++)
240  if(labels[i] != b.labels[i])
241  return false;
242  return true;
243 }
244 
245 bool Address::operator!=(const Address &b) const {
246  return !operator==(b);
247 }
248 
249 class AddrUnsPair {
250 public:
251  Address first;
252  unsigned second;
253  AddrUnsPair(Address _first, unsigned _second);
254  AddrUnsPair &operator=(const AddrUnsPair &b);
255 };
256 
257 AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
258  : first(_first), second(_second)
259 {
260 }
261 
262 AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
263 {
264  first = b.first;
265  second = b.second;
266  return *this;
267 }
268 
269 # endif /* !defined(KMP_DEBUG) && !defined(COVER) */
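//
// Illustrative example (editorial addition): an Address is a path through
// the topology tree, stored coarsest level first in labels[]. On a machine
// modeled with three levels (package / core / thread), the hardware context
// that is thread 1 of core 2 on package 0 could be paired with its OS proc
// id (5 here, a made-up value) as follows:
//
#if 0 // example only
    Address addr(3);            // depth 3: package, core, thread
    addr.labels[0] = 0;         // package id
    addr.labels[1] = 2;         // core id within the package
    addr.labels[2] = 1;         // thread context id within the core
    AddrUnsPair pair(addr, 5);  // second = corresponding OS proc id
#endif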
270 
271 
272 static int
273 __kmp_affinity_cmp_Address_labels(const void *a, const void *b)
274 {
275  const Address *aa = (const Address *)&(((AddrUnsPair *)a)
276  ->first);
277  const Address *bb = (const Address *)&(((AddrUnsPair *)b)
278  ->first);
279  unsigned depth = aa->depth;
280  unsigned i;
281  KMP_DEBUG_ASSERT(depth == bb->depth);
282  for (i = 0; i < depth; i++) {
283  if (aa->labels[i] < bb->labels[i]) return -1;
284  if (aa->labels[i] > bb->labels[i]) return 1;
285  }
286  return 0;
287 }
288 
289 
290 static int
291 __kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
292 {
293  const Address *aa = (const Address *)&(((AddrUnsPair *)a)
294  ->first);
295  const Address *bb = (const Address *)&(((AddrUnsPair *)b)
296  ->first);
297  unsigned depth = aa->depth;
298  unsigned i;
299  KMP_DEBUG_ASSERT(depth == bb->depth);
300  KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
301  KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
302  for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
303  int j = depth - i - 1;
304  if (aa->childNums[j] < bb->childNums[j]) return -1;
305  if (aa->childNums[j] > bb->childNums[j]) return 1;
306  }
307  for (; i < depth; i++) {
308  int j = i - __kmp_affinity_compact;
309  if (aa->childNums[j] < bb->childNums[j]) return -1;
310  if (aa->childNums[j] > bb->childNums[j]) return 1;
311  }
312  return 0;
313 }
314 
320 class hierarchy_info {
321 public:
326  kmp_uint32 maxLevels;
327 
331  kmp_uint32 depth;
332  kmp_uint32 base_num_threads;
333  volatile kmp_int8 uninitialized; // 0=initialized, 1=uninitialized, 2=initialization in progress
334  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
335 
339  kmp_uint32 *numPerLevel;
340  kmp_uint32 *skipPerLevel;
341 
342  void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
343  int hier_depth = adr2os[0].first.depth;
344  int level = 0;
345  for (int i=hier_depth-1; i>=0; --i) {
346  int max = -1;
347  for (int j=0; j<num_addrs; ++j) {
348  int next = adr2os[j].first.childNums[i];
349  if (next > max) max = next;
350  }
351  numPerLevel[level] = max+1;
352  ++level;
353  }
354  }
355 
356  hierarchy_info() : maxLevels(7), depth(1), uninitialized(1), resizing(0) {}
357 
358  // TO FIX: This destructor causes a segfault in the library at shutdown.
359  //~hierarchy_info() { if (!uninitialized && numPerLevel) __kmp_free(numPerLevel); }
360 
361  void init(AddrUnsPair *adr2os, int num_addrs)
362  {
363  kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&uninitialized, 1, 2);
364  if (bool_result == 0) { // Wait for initialization
365  while (TCR_1(uninitialized) != 0) KMP_CPU_PAUSE();
366  return;
367  }
368  KMP_DEBUG_ASSERT(bool_result==1);
369 
370  /* Added explicit initialization of the data fields here to prevent use of dirty
371  values observed when the static library is re-initialized multiple times (e.g. when a
372  non-OpenMP thread repeatedly launches/joins a thread that uses OpenMP). */
373  depth = 1;
374  resizing = 0;
375  maxLevels = 7;
376  numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
377  skipPerLevel = &(numPerLevel[maxLevels]);
378  for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
379  numPerLevel[i] = 1;
380  skipPerLevel[i] = 1;
381  }
382 
383  // Sort table by physical ID
384  if (adr2os) {
385  qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
386  deriveLevels(adr2os, num_addrs);
387  }
388  else {
389  numPerLevel[0] = 4;
390  numPerLevel[1] = num_addrs/4;
391  if (num_addrs%4) numPerLevel[1]++;
392  }
393 
394  base_num_threads = num_addrs;
395  for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
396  if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
397  depth++;
398 
399  kmp_uint32 branch = 4;
400  if (numPerLevel[0] == 1) branch = num_addrs/4;
401  if (branch<4) branch=4;
402  for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
403  while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
404  if (numPerLevel[d] & 1) numPerLevel[d]++;
405  numPerLevel[d] = numPerLevel[d] >> 1;
406  if (numPerLevel[d+1] == 1) depth++;
407  numPerLevel[d+1] = numPerLevel[d+1] << 1;
408  }
409  if(numPerLevel[0] == 1) {
410  branch = branch >> 1;
411  if (branch<4) branch = 4;
412  }
413  }
414 
415  for (kmp_uint32 i=1; i<depth; ++i)
416  skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];
417  // Fill in hierarchy in the case of oversubscription
418  for (kmp_uint32 i=depth; i<maxLevels; ++i)
419  skipPerLevel[i] = 2*skipPerLevel[i-1];
420 
421  uninitialized = 0; // One writer
422 
423  }
424 
425  void resize(kmp_uint32 nproc)
426  {
427  kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
428  if (bool_result == 0) { // Someone else is resizing
429  while (TCR_1(resizing) != 0) KMP_CPU_PAUSE();
430  return;
431  }
432  KMP_DEBUG_ASSERT(bool_result!=0);
433  KMP_DEBUG_ASSERT(nproc > base_num_threads);
434 
435  // Calculate new max_levels
436  kmp_uint32 old_sz = skipPerLevel[depth-1];
437  kmp_uint32 incs = 0, old_maxLevels= maxLevels;
438  while (nproc > old_sz) {
439  old_sz *=2;
440  incs++;
441  }
442  maxLevels += incs;
443 
444  // Resize arrays
445  kmp_uint32 *old_numPerLevel = numPerLevel;
446  kmp_uint32 *old_skipPerLevel = skipPerLevel;
447  numPerLevel = skipPerLevel = NULL;
448  numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
449  skipPerLevel = &(numPerLevel[maxLevels]);
450 
451  // Copy old elements from old arrays
452  for (kmp_uint32 i=0; i<old_maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
453  numPerLevel[i] = old_numPerLevel[i];
454  skipPerLevel[i] = old_skipPerLevel[i];
455  }
456 
457  // Init new elements in arrays to 1
458  for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
459  numPerLevel[i] = 1;
460  skipPerLevel[i] = 1;
461  }
462 
463  // Free old arrays
464  __kmp_free(old_numPerLevel);
465 
466  // Fill in oversubscription levels of hierarchy
467  for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i)
468  skipPerLevel[i] = 2*skipPerLevel[i-1];
469 
470  base_num_threads = nproc;
471  resizing = 0; // One writer
472 
473  }
474 };
475 
476 static hierarchy_info machine_hierarchy;
477 
478 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
479  kmp_uint32 depth;
480  // The test below is true if affinity is available, but set to "none". Need to init on first use of hierarchical barrier.
481  if (TCR_1(machine_hierarchy.uninitialized))
482  machine_hierarchy.init(NULL, nproc);
483  // Adjust the hierarchy in case num threads exceeds original
484  if (nproc > machine_hierarchy.base_num_threads)
485  machine_hierarchy.resize(nproc);
486 
487  depth = machine_hierarchy.depth;
488  KMP_DEBUG_ASSERT(depth > 0);
489  // The loop below adjusts the depth in the case of a resize
490  while (nproc > machine_hierarchy.skipPerLevel[depth-1])
491  depth++;
492 
493  thr_bar->depth = depth;
494  thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
495  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
496 }
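//
// Illustrative sketch (editorial addition): the invariant maintained by
// init() and resize() above. skipPerLevel[i] is the number of leaves
// (hardware threads) spanned by one subtree rooted at level i, i.e. the
// product of the branching factors of all lower levels; entries beyond the
// detected depth are doubled to cover oversubscription.
//
#if 0 // example only, with hypothetical per-level branching factors
static void
__kmp_example_skip_per_level()
{
    kmp_uint32 nums[7]  = {2, 4, 4, 1, 1, 1, 1}; // a numPerLevel[] instance
    kmp_uint32 skips[7] = {1};                   // skipPerLevel[0] is always 1
    for (int i = 1; i < 7; ++i)
        skips[i] = nums[i - 1] * skips[i - 1];
    // skips is now {1, 2, 8, 32, 32, 32, 32}; the real code instead doubles
    // the entries past the detected depth (64, 128, ...).
}
#endif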
497 
498 //
499 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
500 // called to renumber the labels from [0..n] and place them into the child_num
501 // vector of the address object. This is done in case the labels used for
502 // the children at one node of the hierarchy differ from those used for
503 // another node at the same level. Example: suppose the machine has 2 nodes
504 // with 2 packages each. The first node contains packages 601 and 602, and
505 // the second node contains packages 603 and 604. If we try to sort the table
506 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
507 // because we are paying attention to the labels themselves, not the ordinal
508 // child numbers. By using the child numbers in the sort, the result is
509 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
510 //
511 static void
512 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
513  int numAddrs)
514 {
515  KMP_DEBUG_ASSERT(numAddrs > 0);
516  int depth = address2os->first.depth;
517  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
518  unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
519  * sizeof(unsigned));
520  int labCt;
521  for (labCt = 0; labCt < depth; labCt++) {
522  address2os[0].first.childNums[labCt] = counts[labCt] = 0;
523  lastLabel[labCt] = address2os[0].first.labels[labCt];
524  }
525  int i;
526  for (i = 1; i < numAddrs; i++) {
527  for (labCt = 0; labCt < depth; labCt++) {
528  if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
529  int labCt2;
530  for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
531  counts[labCt2] = 0;
532  lastLabel[labCt2] = address2os[i].first.labels[labCt2];
533  }
534  counts[labCt]++;
535  lastLabel[labCt] = address2os[i].first.labels[labCt];
536  break;
537  }
538  }
539  for (labCt = 0; labCt < depth; labCt++) {
540  address2os[i].first.childNums[labCt] = counts[labCt];
541  }
542  for (; labCt < (int)Address::maxDepth; labCt++) {
543  address2os[i].first.childNums[labCt] = 0;
544  }
545  }
546 }
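//
// Worked example (editorial addition) for the renumbering described above,
// using the hypothetical node/package labels from the comment before the
// function:
//
#if 0 // example only
static void
__kmp_example_child_nums()
{
    unsigned labels[4][2] = { {0, 601}, {0, 602}, {1, 603}, {1, 604} };
    AddrUnsPair a2os[4] = { AddrUnsPair(Address(2), 0), AddrUnsPair(Address(2), 1),
                            AddrUnsPair(Address(2), 2), AddrUnsPair(Address(2), 3) };
    for (int p = 0; p < 4; p++) {
        a2os[p].first.labels[0] = labels[p][0]; // node id
        a2os[p].first.labels[1] = labels[p][1]; // package id
    }
    __kmp_affinity_assign_child_nums(a2os, 4);
    // childNums are now {0,0}, {0,1}, {1,0}, {1,1} respectively, so sorting by
    // child numbers can interleave packages across the two nodes instead of
    // following the raw 601..604 label values.
}
#endif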
547 
548 
549 //
550 // All of the __kmp_affinity_create_*_map() routines should set
551 // __kmp_affinity_masks to a vector of affinity mask objects of length
552 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
553 // return the number of levels in the machine topology tree (zero if
554 // __kmp_affinity_type == affinity_none).
555 //
556 // All of the __kmp_affinity_create_*_map() routines should set *fullMask
557 // to the affinity mask for the initialization thread. They need to save and
558 // restore the mask, and it could be needed later, so saving it is just an
559 // optimization to avoid calling __kmp_get_system_affinity() again.
560 //
561 static kmp_affin_mask_t *fullMask = NULL;
562 
563 kmp_affin_mask_t *
564 __kmp_affinity_get_fullMask() { return fullMask; }
565 
566 
567 static int nCoresPerPkg, nPackages;
568 static int __kmp_nThreadsPerCore;
569 #ifndef KMP_DFLT_NTH_CORES
570 static int __kmp_ncores;
571 #endif
572 
573 //
574 // __kmp_affinity_uniform_topology() doesn't work when called from
575 // places which support arbitrarily many levels in the machine topology
576 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
577 // and __kmp_affinity_create_x2apicid_map().
578 //
579 inline static bool
580 __kmp_affinity_uniform_topology()
581 {
582  return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
583 }
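//
// Example (editorial addition, hypothetical numbers): a machine detected as
// 2 packages x 4 cores x 2 threads is uniform only if all 2*4*2 = 16
// hardware threads are available; if one hardware thread is excluded by the
// initial affinity mask so that __kmp_avail_proc == 15, the topology is
// reported as non-uniform.
//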
584 
585 
586 //
587 // Print out the detailed machine topology map, i.e. the physical locations
588 // of each OS proc.
589 //
590 static void
591 __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
592  int pkgLevel, int coreLevel, int threadLevel)
593 {
594  int proc;
595 
596  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
597  for (proc = 0; proc < len; proc++) {
598  int level;
599  kmp_str_buf_t buf;
600  __kmp_str_buf_init(&buf);
601  for (level = 0; level < depth; level++) {
602  if (level == threadLevel) {
603  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
604  }
605  else if (level == coreLevel) {
606  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
607  }
608  else if (level == pkgLevel) {
609  __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
610  }
611  else if (level > pkgLevel) {
612  __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
613  level - pkgLevel - 1);
614  }
615  else {
616  __kmp_str_buf_print(&buf, "L%d ", level);
617  }
618  __kmp_str_buf_print(&buf, "%d ",
619  address2os[proc].first.labels[level]);
620  }
621  KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
622  buf.str);
623  __kmp_str_buf_free(&buf);
624  }
625 }
626 
627 
628 //
629 // If we don't know how to retrieve the machine's processor topology, or
630 // encounter an error in doing so, this routine is called to form a "flat"
631 // mapping of os thread id's <-> processor id's.
632 //
633 static int
634 __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
635  kmp_i18n_id_t *const msg_id)
636 {
637  *address2os = NULL;
638  *msg_id = kmp_i18n_null;
639 
640  //
641  // Even if __kmp_affinity_type == affinity_none, this routine might still
642 // be called to set __kmp_ncores, as well as
643  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
644  //
645  if (! KMP_AFFINITY_CAPABLE()) {
646  KMP_ASSERT(__kmp_affinity_type == affinity_none);
647  __kmp_ncores = nPackages = __kmp_xproc;
648  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
649  if (__kmp_affinity_verbose) {
650  KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
651  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
652  KMP_INFORM(Uniform, "KMP_AFFINITY");
653  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
654  __kmp_nThreadsPerCore, __kmp_ncores);
655  }
656  return 0;
657  }
658 
659  //
660  // When affinity is off, this routine will still be called to set
661  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
662  // nCoresPerPkg, & nPackages. Make sure all these vars are set
663  // correctly, and return now if affinity is not enabled.
664  //
665  __kmp_ncores = nPackages = __kmp_avail_proc;
666  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
667  if (__kmp_affinity_verbose) {
668  char buf[KMP_AFFIN_MASK_PRINT_LEN];
669  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
670 
671  KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
672  if (__kmp_affinity_respect_mask) {
673  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
674  } else {
675  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
676  }
677  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
678  KMP_INFORM(Uniform, "KMP_AFFINITY");
679  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
680  __kmp_nThreadsPerCore, __kmp_ncores);
681  }
682  if (__kmp_affinity_type == affinity_none) {
683  return 0;
684  }
685 
686  //
687  // Construct the data structure to be returned.
688  //
689  *address2os = (AddrUnsPair*)
690  __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
691  int avail_ct = 0;
692  unsigned int i;
693  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
694  //
695  // Skip this proc if it is not included in the machine model.
696  //
697  if (! KMP_CPU_ISSET(i, fullMask)) {
698  continue;
699  }
700 
701  Address addr(1);
702  addr.labels[0] = i;
703  (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
704  }
705  if (__kmp_affinity_verbose) {
706  KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
707  }
708 
709  if (__kmp_affinity_gran_levels < 0) {
710  //
711  // Only the package level is modeled in the machine topology map,
712  // so the #levels of granularity is either 0 or 1.
713  //
714  if (__kmp_affinity_gran > affinity_gran_package) {
715  __kmp_affinity_gran_levels = 1;
716  }
717  else {
718  __kmp_affinity_gran_levels = 0;
719  }
720  }
721  return 1;
722 }
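//
// Illustrative note (editorial addition): with the flat map, each entry of
// *address2os is a depth-1 Address whose single label is the OS proc id
// itself, e.g. ({0},0), ({1},1), ({2},2), ({3},3) on an unrestricted 4-proc
// machine - every OS proc is modeled as its own package.
//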
723 
724 
725 # if KMP_GROUP_AFFINITY
726 
727 //
728 // If multiple Windows* OS processor groups exist, we can create a 2-level
729 // topology map with the groups at level 0 and the individual procs at
730 // level 1.
731 //
732 // This facilitates letting the threads float among all procs in a group,
733 // if granularity=group (the default when there are multiple groups).
734 //
735 static int
736 __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
737  kmp_i18n_id_t *const msg_id)
738 {
739  *address2os = NULL;
740  *msg_id = kmp_i18n_null;
741 
742  //
743  // If we don't have multiple processor groups, return now.
744  // The flat mapping will be used.
745  //
746  if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
747  // FIXME set *msg_id
748  return -1;
749  }
750 
751  //
752  // Construct the data structure to be returned.
753  //
754  *address2os = (AddrUnsPair*)
755  __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
756  int avail_ct = 0;
757  int i;
758  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
759  //
760  // Skip this proc if it is not included in the machine model.
761  //
762  if (! KMP_CPU_ISSET(i, fullMask)) {
763  continue;
764  }
765 
766  Address addr(2);
767  addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
768  addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
769  (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
770 
771  if (__kmp_affinity_verbose) {
772  KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
773  addr.labels[1]);
774  }
775  }
776 
777  if (__kmp_affinity_gran_levels < 0) {
778  if (__kmp_affinity_gran == affinity_gran_group) {
779  __kmp_affinity_gran_levels = 1;
780  }
781  else if ((__kmp_affinity_gran == affinity_gran_fine)
782  || (__kmp_affinity_gran == affinity_gran_thread)) {
783  __kmp_affinity_gran_levels = 0;
784  }
785  else {
786  const char *gran_str = NULL;
787  if (__kmp_affinity_gran == affinity_gran_core) {
788  gran_str = "core";
789  }
790  else if (__kmp_affinity_gran == affinity_gran_package) {
791  gran_str = "package";
792  }
793  else if (__kmp_affinity_gran == affinity_gran_node) {
794  gran_str = "node";
795  }
796  else {
797  KMP_ASSERT(0);
798  }
799 
800  // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
801  __kmp_affinity_gran_levels = 0;
802  }
803  }
804  return 2;
805 }
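//
// Example (editorial addition): with a 64-bit DWORD_PTR, each processor group
// covers CHAR_BIT * sizeof(DWORD_PTR) = 64 procs, so OS proc 70 gets
// labels[0] = 70 / 64 = 1 (the group) and labels[1] = 70 % 64 = 6 (the proc
// within the group).
//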
806 
807 # endif /* KMP_GROUP_AFFINITY */
808 
809 
810 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
811 
812 static int
813 __kmp_cpuid_mask_width(int count) {
814  int r = 0;
815 
816  while((1<<r) < count)
817  ++r;
818  return r;
819 }
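//
// Example (editorial addition): __kmp_cpuid_mask_width() returns the number
// of bits needed to encode 'count' distinct values - e.g. a maxCoresPerPkg
// of 6 gives a width of 3 (since 4 < 6 <= 8), and a count of 1 gives 0.
//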
820 
821 
822 class apicThreadInfo {
823 public:
824  unsigned osId; // param to __kmp_affinity_bind_thread
825  unsigned apicId; // from cpuid after binding
826  unsigned maxCoresPerPkg; // ""
827  unsigned maxThreadsPerPkg; // ""
828  unsigned pkgId; // inferred from above values
829  unsigned coreId; // ""
830  unsigned threadId; // ""
831 };
832 
833 
834 static int
835 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
836 {
837  const apicThreadInfo *aa = (const apicThreadInfo *)a;
838  const apicThreadInfo *bb = (const apicThreadInfo *)b;
839  if (aa->osId < bb->osId) return -1;
840  if (aa->osId > bb->osId) return 1;
841  return 0;
842 }
843 
844 
845 static int
846 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
847 {
848  const apicThreadInfo *aa = (const apicThreadInfo *)a;
849  const apicThreadInfo *bb = (const apicThreadInfo *)b;
850  if (aa->pkgId < bb->pkgId) return -1;
851  if (aa->pkgId > bb->pkgId) return 1;
852  if (aa->coreId < bb->coreId) return -1;
853  if (aa->coreId > bb->coreId) return 1;
854  if (aa->threadId < bb->threadId) return -1;
855  if (aa->threadId > bb->threadId) return 1;
856  return 0;
857 }
858 
859 
860 //
861 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
862 // an algorithm which cycles through the available os threads, setting
863 // the current thread's affinity mask to that thread, and then retrieves
864 // the Apic Id for each thread context using the cpuid instruction.
865 //
866 static int
867 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
868  kmp_i18n_id_t *const msg_id)
869 {
870  kmp_cpuid buf;
871  int rc;
872  *address2os = NULL;
873  *msg_id = kmp_i18n_null;
874 
875  //
876  // Check if cpuid leaf 4 is supported.
877  //
878  __kmp_x86_cpuid(0, 0, &buf);
879  if (buf.eax < 4) {
880  *msg_id = kmp_i18n_str_NoLeaf4Support;
881  return -1;
882  }
883 
884  //
885  // The algorithm used starts by setting the affinity to each available
886  // thread and retrieving info from the cpuid instruction, so if we are
887  // not capable of calling __kmp_get_system_affinity() and
888  // __kmp_set_system_affinity(), then we need to do something else - use
889  // the defaults that we calculated from issuing cpuid without binding
890  // to each proc.
891  //
892  if (! KMP_AFFINITY_CAPABLE()) {
893  //
894  // Hack to try and infer the machine topology using only the data
895  // available from cpuid on the current thread, and __kmp_xproc.
896  //
897  KMP_ASSERT(__kmp_affinity_type == affinity_none);
898 
899  //
900  // Get an upper bound on the number of threads per package using
901  // cpuid(1).
902  //
903  // On some OS/chip combinations where HT is supported by the chip
904  // but is disabled, this value will be 2 on a single core chip.
905  // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
906  //
907  __kmp_x86_cpuid(1, 0, &buf);
908  int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
909  if (maxThreadsPerPkg == 0) {
910  maxThreadsPerPkg = 1;
911  }
912 
913  //
914  // The num cores per pkg comes from cpuid(4).
915  // 1 must be added to the encoded value.
916  //
917  // The author of cpu_count.cpp treated this as only an upper bound
918  // on the number of cores, but I haven't seen any cases where it
919  // was greater than the actual number of cores, so we will treat
920  // it as exact in this block of code.
921  //
922  // First, we need to check if cpuid(4) is supported on this chip.
923  // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
924  // has the value n or greater.
925  //
926  __kmp_x86_cpuid(0, 0, &buf);
927  if (buf.eax >= 4) {
928  __kmp_x86_cpuid(4, 0, &buf);
929  nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
930  }
931  else {
932  nCoresPerPkg = 1;
933  }
934 
935  //
936  // There is no way to reliably tell if HT is enabled without issuing
937  // the cpuid instruction from every thread and correlating the cpuid
938  // info, so if the machine is not affinity capable, we assume that HT
939  // is off. We have seen quite a few machines where maxThreadsPerPkg
940  // is 2, yet the machine does not support HT.
941  //
942  // - Older OSes are usually found on machines with older chips, which
943  // do not support HT.
944  //
945  // - The performance penalty for mistakenly identifying a machine as
946  // HT when it isn't (which results in blocktime being incorrectly set
947  // to 0) is greater than the penalty for mistakenly identifying
948  // a machine as being 1 thread/core when it is really HT enabled
949  // (which results in blocktime being incorrectly set to a positive
950  // value).
951  //
952  __kmp_ncores = __kmp_xproc;
953  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
954  __kmp_nThreadsPerCore = 1;
955  if (__kmp_affinity_verbose) {
956  KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
957  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
958  if (__kmp_affinity_uniform_topology()) {
959  KMP_INFORM(Uniform, "KMP_AFFINITY");
960  } else {
961  KMP_INFORM(NonUniform, "KMP_AFFINITY");
962  }
963  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
964  __kmp_nThreadsPerCore, __kmp_ncores);
965  }
966  return 0;
967  }
968 
969  //
970  //
971  // From here on, we can assume that it is safe to call
972  // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
973  // even if __kmp_affinity_type = affinity_none.
974  //
975 
976  //
977  // Save the affinity mask for the current thread.
978  //
979  kmp_affin_mask_t *oldMask;
980  KMP_CPU_ALLOC(oldMask);
981  KMP_ASSERT(oldMask != NULL);
982  __kmp_get_system_affinity(oldMask, TRUE);
983 
984  //
985  // Run through each of the available contexts, binding the current thread
986  // to it, and obtaining the pertinent information using the cpuid instr.
987  //
988  // The relevant information is:
989  //
990  // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
991  // has a unique Apic Id, which is of the form pkg# : core# : thread#.
992  //
993  // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The
994  // value of this field determines the width of the core# + thread#
995  // fields in the Apic Id. It is also an upper bound on the number
996  // of threads per package, but it has been verified that situations
997  // happen where it is not exact. In particular, on certain OS/chip
998  // combinations where Intel(R) Hyper-Threading Technology is supported
999  // by the chip but has
1000  // been disabled, the value of this field will be 2 (for a single core
1001  // chip). On other OS/chip combinations supporting
1002  // Intel(R) Hyper-Threading Technology, the value of
1003  // this field will be 1 when Intel(R) Hyper-Threading Technology is
1004  // disabled and 2 when it is enabled.
1005  //
1006  // Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The
1007  // value of this field (+1) determines the width of the core# field in
1008  // the Apic Id. The comments in "cpucount.cpp" say that this value is
1009  // an upper bound, but the IA-32 architecture manual says that it is
1010  // exactly the number of cores per package, and I haven't seen any
1011  // case where it wasn't.
1012  //
1013  // From this information, deduce the package Id, core Id, and thread Id,
1014  // and set the corresponding fields in the apicThreadInfo struct.
1015  //
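 // Worked example (editorial addition, hypothetical values): if cpuid(1)
 // reports maxThreadsPerPkg = 16 and cpuid(4) reports maxCoresPerPkg = 8,
 // then widthCT = 4, widthC = 3 and widthT = 1, so an apicId of 0x1D
 // (binary 1 110 1) decodes as pkgId = 0x1D >> 4 = 1,
 // coreId = (0x1D >> 1) & 0x7 = 6, and threadId = 0x1D & 0x1 = 1.
 //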
1016  unsigned i;
1017  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
1018  __kmp_avail_proc * sizeof(apicThreadInfo));
1019  unsigned nApics = 0;
1020  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
1021  //
1022  // Skip this proc if it is not included in the machine model.
1023  //
1024  if (! KMP_CPU_ISSET(i, fullMask)) {
1025  continue;
1026  }
1027  KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
1028 
1029  __kmp_affinity_bind_thread(i);
1030  threadInfo[nApics].osId = i;
1031 
1032  //
1033  // The apic id and max threads per pkg come from cpuid(1).
1034  //
1035  __kmp_x86_cpuid(1, 0, &buf);
1036  if (!((buf.edx >> 9) & 1)) { // require the APIC on-chip flag (cpuid(1) edx bit 9)
1037  __kmp_set_system_affinity(oldMask, TRUE);
1038  __kmp_free(threadInfo);
1039  KMP_CPU_FREE(oldMask);
1040  *msg_id = kmp_i18n_str_ApicNotPresent;
1041  return -1;
1042  }
1043  threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
1044  threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
1045  if (threadInfo[nApics].maxThreadsPerPkg == 0) {
1046  threadInfo[nApics].maxThreadsPerPkg = 1;
1047  }
1048 
1049  //
1050  // Max cores per pkg comes from cpuid(4).
1051  // 1 must be added to the encoded value.
1052  //
1053  // First, we need to check if cpuid(4) is supported on this chip.
1054  // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
1055  // has the value n or greater.
1056  //
1057  __kmp_x86_cpuid(0, 0, &buf);
1058  if (buf.eax >= 4) {
1059  __kmp_x86_cpuid(4, 0, &buf);
1060  threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
1061  }
1062  else {
1063  threadInfo[nApics].maxCoresPerPkg = 1;
1064  }
1065 
1066  //
1067  // Infer the pkgId / coreId / threadId using only the info
1068  // obtained locally.
1069  //
1070  int widthCT = __kmp_cpuid_mask_width(
1071  threadInfo[nApics].maxThreadsPerPkg);
1072  threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
1073 
1074  int widthC = __kmp_cpuid_mask_width(
1075  threadInfo[nApics].maxCoresPerPkg);
1076  int widthT = widthCT - widthC;
1077  if (widthT < 0) {
1078  //
1079  // I've never seen this one happen, but I suppose it could, if
1080  // the cpuid instruction on a chip was really screwed up.
1081  // Make sure to restore the affinity mask before the tail call.
1082  //
1083  __kmp_set_system_affinity(oldMask, TRUE);
1084  __kmp_free(threadInfo);
1085  KMP_CPU_FREE(oldMask);
1086  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1087  return -1;
1088  }
1089 
1090  int maskC = (1 << widthC) - 1;
1091  threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
1092  &maskC;
1093 
1094  int maskT = (1 << widthT) - 1;
1095  threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT;
1096 
1097  nApics++;
1098  }
1099 
1100  //
1101  // We've collected all the info we need.
1102  // Restore the old affinity mask for this thread.
1103  //
1104  __kmp_set_system_affinity(oldMask, TRUE);
1105 
1106  //
1107  // If there's only one thread context to bind to, form an Address object
1108  // with depth 1 and return immediately (or, if affinity is off, set
1109  // address2os to NULL and return).
1110  //
1111  // If it is configured to omit the package level when there is only a
1112  // single package, the logic at the end of this routine won't work if
1113  // there is only a single thread - it would try to form an Address
1114  // object with depth 0.
1115  //
1116  KMP_ASSERT(nApics > 0);
1117  if (nApics == 1) {
1118  __kmp_ncores = nPackages = 1;
1119  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1120  if (__kmp_affinity_verbose) {
1121  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1122  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1123 
1124  KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1125  if (__kmp_affinity_respect_mask) {
1126  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1127  } else {
1128  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1129  }
1130  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1131  KMP_INFORM(Uniform, "KMP_AFFINITY");
1132  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1133  __kmp_nThreadsPerCore, __kmp_ncores);
1134  }
1135 
1136  if (__kmp_affinity_type == affinity_none) {
1137  __kmp_free(threadInfo);
1138  KMP_CPU_FREE(oldMask);
1139  return 0;
1140  }
1141 
1142  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
1143  Address addr(1);
1144  addr.labels[0] = threadInfo[0].pkgId;
1145  (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
1146 
1147  if (__kmp_affinity_gran_levels < 0) {
1148  __kmp_affinity_gran_levels = 0;
1149  }
1150 
1151  if (__kmp_affinity_verbose) {
1152  __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1153  }
1154 
1155  __kmp_free(threadInfo);
1156  KMP_CPU_FREE(oldMask);
1157  return 1;
1158  }
1159 
1160  //
1161  // Sort the threadInfo table by physical Id.
1162  //
1163  qsort(threadInfo, nApics, sizeof(*threadInfo),
1164  __kmp_affinity_cmp_apicThreadInfo_phys_id);
1165 
1166  //
1167  // The table is now sorted by pkgId / coreId / threadId, but we really
1168  // don't know the radix of any of the fields. pkgId's may be sparsely
1169  // assigned among the chips on a system. Although coreId's are usually
1170  // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
1171  // [0..threadsPerCore-1], we don't want to make any such assumptions.
1172  //
1173  // For that matter, we don't know what coresPerPkg and threadsPerCore
1174  // (or the total # packages) are at this point - we want to determine
1175  // that now. We only have an upper bound on the first two figures.
1176  //
1177  // We also perform a consistency check at this point: the cpuid
1178  // instruction had better report the same maxThreadsPerPkg and
1179  // maxCoresPerPkg values for every thread bound to a given package.
1180  //
1181  nPackages = 1;
1182  nCoresPerPkg = 1;
1183  __kmp_nThreadsPerCore = 1;
1184  unsigned nCores = 1;
1185 
1186  unsigned pkgCt = 1; // to determine radii
1187  unsigned lastPkgId = threadInfo[0].pkgId;
1188  unsigned coreCt = 1;
1189  unsigned lastCoreId = threadInfo[0].coreId;
1190  unsigned threadCt = 1;
1191  unsigned lastThreadId = threadInfo[0].threadId;
1192 
1193  // intra-pkg consist checks
1194  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1195  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1196 
1197  for (i = 1; i < nApics; i++) {
1198  if (threadInfo[i].pkgId != lastPkgId) {
1199  nCores++;
1200  pkgCt++;
1201  lastPkgId = threadInfo[i].pkgId;
1202  if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1203  coreCt = 1;
1204  lastCoreId = threadInfo[i].coreId;
1205  if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1206  threadCt = 1;
1207  lastThreadId = threadInfo[i].threadId;
1208 
1209  //
1210  // This is a different package, so go on to the next iteration
1211  // without doing any consistency checks. Reset the consistency
1212  // check vars, though.
1213  //
1214  prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1215  prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1216  continue;
1217  }
1218 
1219  if (threadInfo[i].coreId != lastCoreId) {
1220  nCores++;
1221  coreCt++;
1222  lastCoreId = threadInfo[i].coreId;
1223  if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1224  threadCt = 1;
1225  lastThreadId = threadInfo[i].threadId;
1226  }
1227  else if (threadInfo[i].threadId != lastThreadId) {
1228  threadCt++;
1229  lastThreadId = threadInfo[i].threadId;
1230  }
1231  else {
1232  __kmp_free(threadInfo);
1233  KMP_CPU_FREE(oldMask);
1234  *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1235  return -1;
1236  }
1237 
1238  //
1239  // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
1240  // fields agree between all the threads bound to a given package.
1241  //
1242  if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
1243  || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1244  __kmp_free(threadInfo);
1245  KMP_CPU_FREE(oldMask);
1246  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1247  return -1;
1248  }
1249  }
1250  nPackages = pkgCt;
1251  if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
1252  if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
1253 
1254  //
1255  // When affinity is off, this routine will still be called to set
1256  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1257  // nCoresPerPkg, & nPackages. Make sure all these vars are set
1258  // correctly, and return now if affinity is not enabled.
1259  //
1260  __kmp_ncores = nCores;
1261  if (__kmp_affinity_verbose) {
1262  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1263  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1264 
1265  KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1266  if (__kmp_affinity_respect_mask) {
1267  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1268  } else {
1269  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1270  }
1271  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1272  if (__kmp_affinity_uniform_topology()) {
1273  KMP_INFORM(Uniform, "KMP_AFFINITY");
1274  } else {
1275  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1276  }
1277  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1278  __kmp_nThreadsPerCore, __kmp_ncores);
1279 
1280  }
1281 
1282  if (__kmp_affinity_type == affinity_none) {
1283  __kmp_free(threadInfo);
1284  KMP_CPU_FREE(oldMask);
1285  return 0;
1286  }
1287 
1288  //
1289  // Now that we've determined the number of packages, the number of cores
1290  // per package, and the number of threads per core, we can construct the
1291  // data structure that is to be returned.
1292  //
1293  int pkgLevel = 0;
1294  int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1295  int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1296  unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
1297 
1298  KMP_ASSERT(depth > 0);
1299  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1300 
1301  for (i = 0; i < nApics; ++i) {
1302  Address addr(depth);
1303  unsigned os = threadInfo[i].osId;
1304  int d = 0;
1305 
1306  if (pkgLevel >= 0) {
1307  addr.labels[d++] = threadInfo[i].pkgId;
1308  }
1309  if (coreLevel >= 0) {
1310  addr.labels[d++] = threadInfo[i].coreId;
1311  }
1312  if (threadLevel >= 0) {
1313  addr.labels[d++] = threadInfo[i].threadId;
1314  }
1315  (*address2os)[i] = AddrUnsPair(addr, os);
1316  }
1317 
1318  if (__kmp_affinity_gran_levels < 0) {
1319  //
1320  // Set the granularity level based on what levels are modeled
1321  // in the machine topology map.
1322  //
1323  __kmp_affinity_gran_levels = 0;
1324  if ((threadLevel >= 0)
1325  && (__kmp_affinity_gran > affinity_gran_thread)) {
1326  __kmp_affinity_gran_levels++;
1327  }
1328  if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1329  __kmp_affinity_gran_levels++;
1330  }
1331  if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1332  __kmp_affinity_gran_levels++;
1333  }
1334  }
1335 
1336  if (__kmp_affinity_verbose) {
1337  __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1338  coreLevel, threadLevel);
1339  }
1340 
1341  __kmp_free(threadInfo);
1342  KMP_CPU_FREE(oldMask);
1343  return depth;
1344 }
1345 
1346 
1347 //
1348 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1349 // architectures support a newer interface for specifying the x2APIC Ids,
1350 // based on cpuid leaf 11.
1351 //
1352 static int
1353 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1354  kmp_i18n_id_t *const msg_id)
1355 {
1356  kmp_cpuid buf;
1357 
1358  *address2os = NULL;
1359  *msg_id = kmp_i18n_null;
1360 
1361  //
1362  // Check to see if cpuid leaf 11 is supported.
1363  //
1364  __kmp_x86_cpuid(0, 0, &buf);
1365  if (buf.eax < 11) {
1366  *msg_id = kmp_i18n_str_NoLeaf11Support;
1367  return -1;
1368  }
1369  __kmp_x86_cpuid(11, 0, &buf);
1370  if (buf.ebx == 0) {
1371  *msg_id = kmp_i18n_str_NoLeaf11Support;
1372  return -1;
1373  }
1374 
1375  //
1376  // Find the number of levels in the machine topology. While we're at it,
1377  // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will
1378  // try to get more accurate values later by explicitly counting them,
1379  // but get reasonable defaults now, in case we return early.
1380  //
1381  int level;
1382  int threadLevel = -1;
1383  int coreLevel = -1;
1384  int pkgLevel = -1;
1385  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1386 
1387  for (level = 0;; level++) {
1388  if (level > 31) {
1389  //
1390  // FIXME: Hack for DPD200163180
1391  //
1392  // If level is big then something went wrong -> exiting
1393  //
1394  // There could actually be 32 valid levels in the machine topology,
1395  // but so far, the only machine we have seen which does not exit
1396  // this loop before iteration 32 has fubar x2APIC settings.
1397  //
1398  // For now, just reject this case based upon loop trip count.
1399  //
1400  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1401  return -1;
1402  }
1403  __kmp_x86_cpuid(11, level, &buf);
1404  if (buf.ebx == 0) {
1405  if (pkgLevel < 0) {
1406  //
1407  // Will infer nPackages from __kmp_xproc
1408  //
1409  pkgLevel = level;
1410  level++;
1411  }
1412  break;
1413  }
1414  int kind = (buf.ecx >> 8) & 0xff;
1415  if (kind == 1) {
1416  //
1417  // SMT level
1418  //
1419  threadLevel = level;
1420  coreLevel = -1;
1421  pkgLevel = -1;
1422  __kmp_nThreadsPerCore = buf.ebx & 0xff;
1423  if (__kmp_nThreadsPerCore == 0) {
1424  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1425  return -1;
1426  }
1427  }
1428  else if (kind == 2) {
1429  //
1430  // core level
1431  //
1432  coreLevel = level;
1433  pkgLevel = -1;
1434  nCoresPerPkg = buf.ebx & 0xff;
1435  if (nCoresPerPkg == 0) {
1436  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1437  return -1;
1438  }
1439  }
1440  else {
1441  if (level <= 0) {
1442  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1443  return -1;
1444  }
1445  if (pkgLevel >= 0) {
1446  continue;
1447  }
1448  pkgLevel = level;
1449  nPackages = buf.ebx & 0xff;
1450  if (nPackages == 0) {
1451  *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1452  return -1;
1453  }
1454  }
1455  }
1456  int depth = level;
1457 
1458  //
1459  // In the above loop, "level" was counted from the finest level (usually
1460  // thread) to the coarsest. The caller expects that we will place the
1461  // labels in (*address2os)[].first.labels[] in the inverse order, so
1462  // we need to invert the vars saying which level means what.
1463  //
1464  if (threadLevel >= 0) {
1465  threadLevel = depth - threadLevel - 1;
1466  }
1467  if (coreLevel >= 0) {
1468  coreLevel = depth - coreLevel - 1;
1469  }
1470  KMP_DEBUG_ASSERT(pkgLevel >= 0);
1471  pkgLevel = depth - pkgLevel - 1;
1472 
1473  //
1474  // The algorithm used starts by setting the affinity to each available
1475  // thread and retrieving info from the cpuid instruction, so if we are
1476  // not capable of calling __kmp_get_system_affinity() and
1477  // __kmp_set_system_affinity(), then we need to do something else - use
1478  // the defaults that we calculated from issuing cpuid without binding
1479  // to each proc.
1480  //
1481  if (! KMP_AFFINITY_CAPABLE())
1482  {
1483  //
1484  // Hack to try and infer the machine topology using only the data
1485  // available from cpuid on the current thread, and __kmp_xproc.
1486  //
1487  KMP_ASSERT(__kmp_affinity_type == affinity_none);
1488 
1489  __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1490  nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1491  if (__kmp_affinity_verbose) {
1492  KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1493  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1494  if (__kmp_affinity_uniform_topology()) {
1495  KMP_INFORM(Uniform, "KMP_AFFINITY");
1496  } else {
1497  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1498  }
1499  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1500  __kmp_nThreadsPerCore, __kmp_ncores);
1501  }
1502  return 0;
1503  }
1504 
1505  //
1506  //
1507  // From here on, we can assume that it is safe to call
1508  // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
1509  // even if __kmp_affinity_type = affinity_none.
1510  //
1511 
1512  //
1513  // Save the affinity mask for the current thread.
1514  //
1515  kmp_affin_mask_t *oldMask;
1516  KMP_CPU_ALLOC(oldMask);
1517  __kmp_get_system_affinity(oldMask, TRUE);
1518 
1519  //
1520  // Allocate the data structure to be returned.
1521  //
1522  AddrUnsPair *retval = (AddrUnsPair *)
1523  __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1524 
1525  //
1526  // Run through each of the available contexts, binding the current thread
1527  // to it, and obtaining the pertinent information using the cpuid instr.
1528  //
1529  unsigned int proc;
1530  int nApics = 0;
1531  for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
1532  //
1533  // Skip this proc if it is not included in the machine model.
1534  //
1535  if (! KMP_CPU_ISSET(proc, fullMask)) {
1536  continue;
1537  }
1538  KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1539 
1540  __kmp_affinity_bind_thread(proc);
1541 
1542  //
1543  // Extract the labels for each level in the machine topology map
1544  // from the Apic ID.
1545  //
1546  Address addr(depth);
1547  int prev_shift = 0;
1548 
1549  for (level = 0; level < depth; level++) {
1550  __kmp_x86_cpuid(11, level, &buf);
1551  unsigned apicId = buf.edx;
1552  if (buf.ebx == 0) {
1553  if (level != depth - 1) {
1554  KMP_CPU_FREE(oldMask);
1555  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1556  return -1;
1557  }
1558  addr.labels[depth - level - 1] = apicId >> prev_shift;
1559  level++;
1560  break;
1561  }
1562  int shift = buf.eax & 0x1f;
1563  int mask = (1 << shift) - 1;
1564  addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1565  prev_shift = shift;
1566  }
1567  if (level != depth) {
1568  KMP_CPU_FREE(oldMask);
1569  *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1570  return -1;
1571  }
1572 
1573  retval[nApics] = AddrUnsPair(addr, proc);
1574  nApics++;
1575  }
1576 
1577  //
1578  // We've collected all the info we need.
1579  // Restore the old affinity mask for this thread.
1580  //
1581  __kmp_set_system_affinity(oldMask, TRUE);
1582 
1583  //
1584  // If there's only one thread context to bind to, return now.
1585  //
1586  KMP_ASSERT(nApics > 0);
1587  if (nApics == 1) {
1588  __kmp_ncores = nPackages = 1;
1589  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1590  if (__kmp_affinity_verbose) {
1591  char buf[KMP_AFFIN_MASK_PRINT_LEN];
1592  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1593 
1594  KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1595  if (__kmp_affinity_respect_mask) {
1596  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1597  } else {
1598  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1599  }
1600  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1601  KMP_INFORM(Uniform, "KMP_AFFINITY");
1602  KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1603  __kmp_nThreadsPerCore, __kmp_ncores);
1604  }
1605 
1606  if (__kmp_affinity_type == affinity_none) {
1607  __kmp_free(retval);
1608  KMP_CPU_FREE(oldMask);
1609  return 0;
1610  }
1611 
1612  //
1613  // Form an Address object which only includes the package level.
1614  //
1615  Address addr(1);
1616  addr.labels[0] = retval[0].first.labels[pkgLevel];
1617  retval[0].first = addr;
1618 
1619  if (__kmp_affinity_gran_levels < 0) {
1620  __kmp_affinity_gran_levels = 0;
1621  }
1622 
1623  if (__kmp_affinity_verbose) {
1624  __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1625  }
1626 
1627  *address2os = retval;
1628  KMP_CPU_FREE(oldMask);
1629  return 1;
1630  }
1631 
1632  //
1633  // Sort the table by physical Id.
1634  //
1635  qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1636 
1637  //
1638  // Find the radix at each of the levels.
1639  //
1640  unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1641  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1642  unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1643  unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1644  for (level = 0; level < depth; level++) {
1645  totals[level] = 1;
1646  maxCt[level] = 1;
1647  counts[level] = 1;
1648  last[level] = retval[0].first.labels[level];
1649  }
1650 
1651  //
1652  // From here on, the iteration variable "level" runs from the finest
1653  // level to the coarsest, i.e. we iterate forward through
1654  // (*address2os)[].first.labels[] - in the previous loops, we iterated
1655  // backwards.
1656  //
1657  for (proc = 1; (int)proc < nApics; proc++) {
1658  int level;
1659  for (level = 0; level < depth; level++) {
1660  if (retval[proc].first.labels[level] != last[level]) {
1661  int j;
1662  for (j = level + 1; j < depth; j++) {
1663  totals[j]++;
1664  counts[j] = 1;
1665  // The line below causes incorrect topology information to be printed
1666  // when the maximum value for some level (maxCt[level]) is encountered
1667  // earlier in the array than a smaller value.
1668  // For example, if pkg0 has 4 cores and pkg1 has 2 cores, maxCt[1] ends
1669  // up as 2 whereas it must be 4.
1670  // TODO!!! Check if it can be commented safely
1671  //maxCt[j] = 1;
1672  last[j] = retval[proc].first.labels[j];
1673  }
1674  totals[level]++;
1675  counts[level]++;
1676  if (counts[level] > maxCt[level]) {
1677  maxCt[level] = counts[level];
1678  }
1679  last[level] = retval[proc].first.labels[level];
1680  break;
1681  }
1682  else if (level == depth - 1) {
1683  __kmp_free(last);
1684  __kmp_free(maxCt);
1685  __kmp_free(counts);
1686  __kmp_free(totals);
1687  __kmp_free(retval);
1688  KMP_CPU_FREE(oldMask);
1689  *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1690  return -1;
1691  }
1692  }
1693  }
1694 
1695  //
1696  // When affinity is off, this routine will still be called to set
1697  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
1698  // nCoresPerPkg, & nPackages. Make sure all these vars are set
1699  // correctly, and return if affinity is not enabled.
1700  //
1701  if (threadLevel >= 0) {
1702  __kmp_nThreadsPerCore = maxCt[threadLevel];
1703  }
1704  else {
1705  __kmp_nThreadsPerCore = 1;
1706  }
1707  nPackages = totals[pkgLevel];
1708 
1709  if (coreLevel >= 0) {
1710  __kmp_ncores = totals[coreLevel];
1711  nCoresPerPkg = maxCt[coreLevel];
1712  }
1713  else {
1714  __kmp_ncores = nPackages;
1715  nCoresPerPkg = 1;
1716  }
1717 
1718  //
1719  // Check to see if the machine topology is uniform
1720  //
1721  unsigned prod = maxCt[0];
1722  for (level = 1; level < depth; level++) {
1723  prod *= maxCt[level];
1724  }
1725  bool uniform = (prod == totals[level - 1]);
1726 
1727  //
1728  // Print the machine topology summary.
1729  //
1730  if (__kmp_affinity_verbose) {
1731  char mask[KMP_AFFIN_MASK_PRINT_LEN];
1732  __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1733 
1734  KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1735  if (__kmp_affinity_respect_mask) {
1736  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1737  } else {
1738  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1739  }
1740  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1741  if (uniform) {
1742  KMP_INFORM(Uniform, "KMP_AFFINITY");
1743  } else {
1744  KMP_INFORM(NonUniform, "KMP_AFFINITY");
1745  }
1746 
1747  kmp_str_buf_t buf;
1748  __kmp_str_buf_init(&buf);
1749 
1750  __kmp_str_buf_print(&buf, "%d", totals[0]);
1751  for (level = 1; level <= pkgLevel; level++) {
1752  __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1753  }
1754  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1755  __kmp_nThreadsPerCore, __kmp_ncores);
1756 
1757  __kmp_str_buf_free(&buf);
1758  }
1759 
1760  if (__kmp_affinity_type == affinity_none) {
1761  __kmp_free(last);
1762  __kmp_free(maxCt);
1763  __kmp_free(counts);
1764  __kmp_free(totals);
1765  __kmp_free(retval);
1766  KMP_CPU_FREE(oldMask);
1767  return 0;
1768  }
1769 
1770  //
1771  // Find any levels with radix 1, and remove them from the map
1772  // (except for the package level).
1773  //
1774  int new_depth = 0;
1775  for (level = 0; level < depth; level++) {
1776  if ((maxCt[level] == 1) && (level != pkgLevel)) {
1777  continue;
1778  }
1779  new_depth++;
1780  }
1781 
1782  //
1783  // If we are removing any levels, allocate a new vector to return,
1784  // and copy the relevant information to it.
1785  //
1786  if (new_depth != depth) {
1787  AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
1788  sizeof(AddrUnsPair) * nApics);
1789  for (proc = 0; (int)proc < nApics; proc++) {
1790  Address addr(new_depth);
1791  new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1792  }
1793  int new_level = 0;
1794  for (level = 0; level < depth; level++) {
1795  if ((maxCt[level] == 1) && (level != pkgLevel)) {
1796  if (level == threadLevel) {
1797  threadLevel = -1;
1798  }
1799  else if ((threadLevel >= 0) && (level < threadLevel)) {
1800  threadLevel--;
1801  }
1802  if (level == coreLevel) {
1803  coreLevel = -1;
1804  }
1805  else if ((coreLevel >= 0) && (level < coreLevel)) {
1806  coreLevel--;
1807  }
1808  if (level < pkgLevel) {
1809  pkgLevel--;
1810  }
1811  continue;
1812  }
1813  for (proc = 0; (int)proc < nApics; proc++) {
1814  new_retval[proc].first.labels[new_level]
1815  = retval[proc].first.labels[level];
1816  }
1817  new_level++;
1818  }
1819 
1820  __kmp_free(retval);
1821  retval = new_retval;
1822  depth = new_depth;
1823  }
1824 
1825  if (__kmp_affinity_gran_levels < 0) {
1826  //
1827  // Set the granularity level based on what levels are modeled
1828  // in the machine topology map.
1829  //
1830  __kmp_affinity_gran_levels = 0;
1831  if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1832  __kmp_affinity_gran_levels++;
1833  }
1834  if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1835  __kmp_affinity_gran_levels++;
1836  }
1837  if (__kmp_affinity_gran > affinity_gran_package) {
1838  __kmp_affinity_gran_levels++;
1839  }
1840  }
1841 
1842  if (__kmp_affinity_verbose) {
1843  __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
1844  coreLevel, threadLevel);
1845  }
1846 
1847  __kmp_free(last);
1848  __kmp_free(maxCt);
1849  __kmp_free(counts);
1850  __kmp_free(totals);
1851  KMP_CPU_FREE(oldMask);
1852  *address2os = retval;
1853  return depth;
1854 }
1855 
1856 
1857 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1858 
1859 
1860 #define osIdIndex 0
1861 #define threadIdIndex 1
1862 #define coreIdIndex 2
1863 #define pkgIdIndex 3
1864 #define nodeIdIndex 4
1865 
1866 typedef unsigned *ProcCpuInfo;
1867 static unsigned maxIndex = pkgIdIndex;
1868 
1869 
1870 static int
1871 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
1872 {
1873  const unsigned *aa = (const unsigned *)a;
1874  const unsigned *bb = (const unsigned *)b;
1875  if (aa[osIdIndex] < bb[osIdIndex]) return -1;
1876  if (aa[osIdIndex] > bb[osIdIndex]) return 1;
1877  return 0;
1878 };
1879 
1880 
1881 static int
1882 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
1883 {
1884  unsigned i;
1885  const unsigned *aa = *((const unsigned **)a);
1886  const unsigned *bb = *((const unsigned **)b);
1887  for (i = maxIndex; ; i--) {
1888  if (aa[i] < bb[i]) return -1;
1889  if (aa[i] > bb[i]) return 1;
1890  if (i == osIdIndex) break;
1891  }
1892  return 0;
1893 }
1894 
1895 
1896 //
1897 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1898 // affinity map.
1899 //
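//
// For reference, a minimal sketch of the record format this parser expects
// (one block per logical CPU, blocks separated by a blank line; all values
// below are made up):
//
//     processor   : 0
//     physical id : 0
//     core id     : 0
//     thread id   : 0
//
//     processor   : 1
//     physical id : 0
//     core id     : 0
//     thread id   : 1
//
// The "physical id" field is required, "thread id" is auto-assigned if it
// is missing, optional "node_<n> id" fields may appear, and any other
// leading token is skipped.
//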
1900 static int
1901 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
1902  kmp_i18n_id_t *const msg_id, FILE *f)
1903 {
1904  *address2os = NULL;
1905  *msg_id = kmp_i18n_null;
1906 
1907  //
1908  // Scan the file, count the number of "processor" (osId) fields,
1909  // and find the highest value of <n> for a node_<n> field.
1910  //
1911  char buf[256];
1912  unsigned num_records = 0;
1913  while (! feof(f)) {
1914  buf[sizeof(buf) - 1] = 1;
1915  if (! fgets(buf, sizeof(buf), f)) {
1916  //
1917  // Read errors presumably because of EOF
1918  //
1919  break;
1920  }
1921 
1922  char s1[] = "processor";
1923  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1924  num_records++;
1925  continue;
1926  }
1927 
1928  //
1929  // FIXME - this will match "node_<n> <garbage>"
1930  //
1931  unsigned level;
1932  if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
1933  if (nodeIdIndex + level >= maxIndex) {
1934  maxIndex = nodeIdIndex + level;
1935  }
1936  continue;
1937  }
1938  }
1939 
1940  //
1941  // Check for empty file / no valid processor records, or too many.
1942  // The number of records can't exceed the number of valid bits in the
1943  // affinity mask.
1944  //
1945  if (num_records == 0) {
1946  *line = 0;
1947  *msg_id = kmp_i18n_str_NoProcRecords;
1948  return -1;
1949  }
1950  if (num_records > (unsigned)__kmp_xproc) {
1951  *line = 0;
1952  *msg_id = kmp_i18n_str_TooManyProcRecords;
1953  return -1;
1954  }
1955 
1956  //
1957  // Set the file pointer back to the beginning, so that we can scan the
1958  // file again, this time performing a full parse of the data.
1959  // Allocate a vector of ProcCpuInfo objects, where we will place the data.
1960  // Adding an extra element at the end allows us to remove a lot of extra
1961  // checks for termination conditions.
1962  //
1963  if (fseek(f, 0, SEEK_SET) != 0) {
1964  *line = 0;
1965  *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1966  return -1;
1967  }
1968 
1969  //
1970  // Allocate the array of records to store the proc info in. The dummy
1971  // element at the end makes the logic in filling them out easier to code.
1972  //
1973  unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
1974  * sizeof(unsigned *));
1975  unsigned i;
1976  for (i = 0; i <= num_records; i++) {
1977  threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
1978  * sizeof(unsigned));
1979  }
1980 
1981 #define CLEANUP_THREAD_INFO \
1982  for (i = 0; i <= num_records; i++) { \
1983  __kmp_free(threadInfo[i]); \
1984  } \
1985  __kmp_free(threadInfo);
1986 
1987  //
1988  // A value of UINT_MAX means that we didn't find the field
1989  //
1990  unsigned __index;
1991 
1992 #define INIT_PROC_INFO(p) \
1993  for (__index = 0; __index <= maxIndex; __index++) { \
1994  (p)[__index] = UINT_MAX; \
1995  }
1996 
1997  for (i = 0; i <= num_records; i++) {
1998  INIT_PROC_INFO(threadInfo[i]);
1999  }
2000 
2001  unsigned num_avail = 0;
2002  *line = 0;
2003  while (! feof(f)) {
2004  //
2005  // Create an inner scoping level, so that all the goto targets at the
2006  // end of the loop appear in an outer scoping level. This avoids
2007  // warnings about jumping past an initialization to a target in the
2008  // same block.
2009  //
2010  {
2011  buf[sizeof(buf) - 1] = 1;
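// The byte set above acts as a sentinel: if the line fills the entire
// buffer, fgets() overwrites it with the terminating '\0', which is how
// over-long lines are detected below.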
2012  bool long_line = false;
2013  if (! fgets(buf, sizeof(buf), f)) {
2014  //
2015  // Read errors presumably because of EOF
2016  //
2017  // If there is valid data in threadInfo[num_avail], then fake
2018  // a blank line to ensure that the last address gets parsed.
2019  //
2020  bool valid = false;
2021  for (i = 0; i <= maxIndex; i++) {
2022  if (threadInfo[num_avail][i] != UINT_MAX) {
2023  valid = true;
2024  }
2025  }
2026  if (! valid) {
2027  break;
2028  }
2029  buf[0] = 0;
2030  } else if (!buf[sizeof(buf) - 1]) {
2031  //
2032  // The line is longer than the buffer. Set a flag, and only
2033  // report an error later if the line contains a field we care about.
2034  //
2035  long_line = true;
2036 
2037 #define CHECK_LINE \
2038  if (long_line) { \
2039  CLEANUP_THREAD_INFO; \
2040  *msg_id = kmp_i18n_str_LongLineCpuinfo; \
2041  return -1; \
2042  }
2043  }
2044  (*line)++;
2045 
2046  char s1[] = "processor";
2047  if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2048  CHECK_LINE;
2049  char *p = strchr(buf + sizeof(s1) - 1, ':');
2050  unsigned val;
2051  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2052  if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
2053  threadInfo[num_avail][osIdIndex] = val;
2054 #if KMP_OS_LINUX && USE_SYSFS_INFO
2055  char path[256];
2056  KMP_SNPRINTF(path, sizeof(path),
2057  "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
2058  threadInfo[num_avail][osIdIndex]);
2059  __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
2060 
2061  KMP_SNPRINTF(path, sizeof(path),
2062  "/sys/devices/system/cpu/cpu%u/topology/core_id",
2063  threadInfo[num_avail][osIdIndex]);
2064  __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
2065  continue;
2066 #else
2067  }
2068  char s2[] = "physical id";
2069  if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
2070  CHECK_LINE;
2071  char *p = strchr(buf + sizeof(s2) - 1, ':');
2072  unsigned val;
2073  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2074  if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
2075  threadInfo[num_avail][pkgIdIndex] = val;
2076  continue;
2077  }
2078  char s3[] = "core id";
2079  if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
2080  CHECK_LINE;
2081  char *p = strchr(buf + sizeof(s3) - 1, ':');
2082  unsigned val;
2083  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2084  if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
2085  threadInfo[num_avail][coreIdIndex] = val;
2086  continue;
2087 #endif // KMP_OS_LINUX && USE_SYSFS_INFO
2088  }
2089  char s4[] = "thread id";
2090  if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2091  CHECK_LINE;
2092  char *p = strchr(buf + sizeof(s4) - 1, ':');
2093  unsigned val;
2094  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2095  if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
2096  threadInfo[num_avail][threadIdIndex] = val;
2097  continue;
2098  }
2099  unsigned level;
2100  if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
2101  CHECK_LINE;
2102  char *p = strchr(buf + sizeof(s4) - 1, ':');
2103  unsigned val;
2104  if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
2105  KMP_ASSERT(nodeIdIndex + level <= maxIndex);
2106  if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
2107  threadInfo[num_avail][nodeIdIndex + level] = val;
2108  continue;
2109  }
2110 
2111  //
2112  // We didn't recognize the leading token on the line.
2113  // There are lots of leading tokens that we don't recognize -
2114  // if the line isn't empty, go on to the next line.
2115  //
2116  if ((*buf != 0) && (*buf != '\n')) {
2117  //
2118  // If the line is longer than the buffer, read characters
2119  // until we find a newline.
2120  //
2121  if (long_line) {
2122  int ch;
2123  while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
2124  }
2125  continue;
2126  }
2127 
2128  //
2129  // A newline has signalled the end of the processor record.
2130  // Check that there aren't too many procs specified.
2131  //
2132  if ((int)num_avail == __kmp_xproc) {
2133  CLEANUP_THREAD_INFO;
2134  *msg_id = kmp_i18n_str_TooManyEntries;
2135  return -1;
2136  }
2137 
2138  //
2139  // Check for missing fields. The osId field must be there, and we
2140  // currently require that the physical id field is specified as well.
2141  //
2142  if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2143  CLEANUP_THREAD_INFO;
2144  *msg_id = kmp_i18n_str_MissingProcField;
2145  return -1;
2146  }
2147  if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
2148  CLEANUP_THREAD_INFO;
2149  *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2150  return -1;
2151  }
2152 
2153  //
2154  // Skip this proc if it is not included in the machine model.
2155  //
2156  if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
2157  INIT_PROC_INFO(threadInfo[num_avail]);
2158  continue;
2159  }
2160 
2161  //
2162  // We have a successful parse of this proc's info.
2163  // Increment the counter, and prepare for the next proc.
2164  //
2165  num_avail++;
2166  KMP_ASSERT(num_avail <= num_records);
2167  INIT_PROC_INFO(threadInfo[num_avail]);
2168  }
2169  continue;
2170 
2171  no_val:
2172  CLEANUP_THREAD_INFO;
2173  *msg_id = kmp_i18n_str_MissingValCpuinfo;
2174  return -1;
2175 
2176  dup_field:
2177  CLEANUP_THREAD_INFO;
2178  *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2179  return -1;
2180  }
2181  *line = 0;
2182 
2183 # if KMP_MIC && REDUCE_TEAM_SIZE
2184  unsigned teamSize = 0;
2185 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2186 
2187  // check for num_records == __kmp_xproc ???
2188 
2189  //
2190  // If there's only one thread context to bind to, form an Address object
2191  // with depth 1 and return immediately (or, if affinity is off, set
2192  // address2os to NULL and return).
2193  //
2194  // If it is configured to omit the package level when there is only a
2195  // single package, the logic at the end of this routine won't work if
2196  // there is only a single thread - it would try to form an Address
2197  // object with depth 0.
2198  //
2199  KMP_ASSERT(num_avail > 0);
2200  KMP_ASSERT(num_avail <= num_records);
2201  if (num_avail == 1) {
2202  __kmp_ncores = 1;
2203  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2204  if (__kmp_affinity_verbose) {
2205  if (! KMP_AFFINITY_CAPABLE()) {
2206  KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2207  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2208  KMP_INFORM(Uniform, "KMP_AFFINITY");
2209  }
2210  else {
2211  char buf[KMP_AFFIN_MASK_PRINT_LEN];
2212  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2213  fullMask);
2214  KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2215  if (__kmp_affinity_respect_mask) {
2216  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2217  } else {
2218  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2219  }
2220  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2221  KMP_INFORM(Uniform, "KMP_AFFINITY");
2222  }
2223  int index;
2224  kmp_str_buf_t buf;
2225  __kmp_str_buf_init(&buf);
2226  __kmp_str_buf_print(&buf, "1");
2227  for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2228  __kmp_str_buf_print(&buf, " x 1");
2229  }
2230  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2231  __kmp_str_buf_free(&buf);
2232  }
2233 
2234  if (__kmp_affinity_type == affinity_none) {
2235  CLEANUP_THREAD_INFO;
2236  return 0;
2237  }
2238 
2239  *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
2240  Address addr(1);
2241  addr.labels[0] = threadInfo[0][pkgIdIndex];
2242  (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2243 
2244  if (__kmp_affinity_gran_levels < 0) {
2245  __kmp_affinity_gran_levels = 0;
2246  }
2247 
2248  if (__kmp_affinity_verbose) {
2249  __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2250  }
2251 
2252  CLEANUP_THREAD_INFO;
2253  return 1;
2254  }
2255 
2256  //
2257  // Sort the threadInfo table by physical Id.
2258  //
2259  qsort(threadInfo, num_avail, sizeof(*threadInfo),
2260  __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2261 
2262  //
2263  // The table is now sorted by pkgId / coreId / threadId, but we really
2264  // don't know the radix of any of the fields. pkgId's may be sparsely
2265  // assigned among the chips on a system. Although coreId's are usually
2266  // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
2267  // [0..threadsPerCore-1], we don't want to make any such assumptions.
2268  //
2269  // For that matter, we don't know what coresPerPkg and threadsPerCore
2270  // (or the total # packages) are at this point - we want to determine
2271  // that now. We only have an upper bound on the first two figures.
2272  //
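//
// Illustrative sketch of the counting scheme used below (ids are made up).
// Given records sorted as (pkg, core, thread) =
//     (0,0,0) (0,0,1) (0,1,0) (0,1,1) (1,0,0) (1,0,1)
// the loop below ends with
//     totals[pkgIdIndex]   == 2   // 2 packages seen
//     totals[coreIdIndex]  == 3   // 3 distinct cores overall
//     maxCt[coreIdIndex]   == 2   // at most 2 cores within one package
//     maxCt[threadIdIndex] == 2   // at most 2 threads within one core
// counts[] holds the running count within the current parent object and is
// reset whenever a more significant id changes.
//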
2273  unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1)
2274  * sizeof(unsigned));
2275  unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1)
2276  * sizeof(unsigned));
2277  unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1)
2278  * sizeof(unsigned));
2279  unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1)
2280  * sizeof(unsigned));
2281 
2282  bool assign_thread_ids = false;
2283  unsigned threadIdCt;
2284  unsigned index;
2285 
2286  restart_radix_check:
2287  threadIdCt = 0;
2288 
2289  //
2290  // Initialize the counter arrays with data from threadInfo[0].
2291  //
2292  if (assign_thread_ids) {
2293  if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2294  threadInfo[0][threadIdIndex] = threadIdCt++;
2295  }
2296  else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2297  threadIdCt = threadInfo[0][threadIdIndex] + 1;
2298  }
2299  }
2300  for (index = 0; index <= maxIndex; index++) {
2301  counts[index] = 1;
2302  maxCt[index] = 1;
2303  totals[index] = 1;
2304  lastId[index] = threadInfo[0][index];;
2305  }
2306 
2307  //
2308  // Run through the rest of the OS procs.
2309  //
2310  for (i = 1; i < num_avail; i++) {
2311  //
2312  // Find the most significant index whose id differs
2313  // from the id for the previous OS proc.
2314  //
2315  for (index = maxIndex; index >= threadIdIndex; index--) {
2316  if (assign_thread_ids && (index == threadIdIndex)) {
2317  //
2318  // Auto-assign the thread id field if it wasn't specified.
2319  //
2320  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2321  threadInfo[i][threadIdIndex] = threadIdCt++;
2322  }
2323 
2324  //
2325  // Apparently the thread id field was specified for some
2326  // entries and not others. Start the thread id counter
2327  // off at the next higher thread id.
2328  //
2329  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2330  threadIdCt = threadInfo[i][threadIdIndex] + 1;
2331  }
2332  }
2333  if (threadInfo[i][index] != lastId[index]) {
2334  //
2335  // Run through all indices which are less significant,
2336  // and reset the counts to 1.
2337  //
2338  // At all levels up to and including index, we need to
2339  // increment the totals and record the last id.
2340  //
2341  unsigned index2;
2342  for (index2 = threadIdIndex; index2 < index; index2++) {
2343  totals[index2]++;
2344  if (counts[index2] > maxCt[index2]) {
2345  maxCt[index2] = counts[index2];
2346  }
2347  counts[index2] = 1;
2348  lastId[index2] = threadInfo[i][index2];
2349  }
2350  counts[index]++;
2351  totals[index]++;
2352  lastId[index] = threadInfo[i][index];
2353 
2354  if (assign_thread_ids && (index > threadIdIndex)) {
2355 
2356 # if KMP_MIC && REDUCE_TEAM_SIZE
2357  //
2358  // The default team size is the total #threads in the machine
2359  // minus 1 thread for every core that has 3 or more threads.
2360  //
2361  teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2362 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2363 
2364  //
2365  // Restart the thread counter, as we are on a new core.
2366  //
2367  threadIdCt = 0;
2368 
2369  //
2370  // Auto-assign the thread id field if it wasn't specified.
2371  //
2372  if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2373  threadInfo[i][threadIdIndex] = threadIdCt++;
2374  }
2375 
2376  //
2377  // Apparently the thread id field was specified for some
2378  // entries and not others. Start the thread id counter
2379  // off at the next higher thread id.
2380  //
2381  else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2382  threadIdCt = threadInfo[i][threadIdIndex] + 1;
2383  }
2384  }
2385  break;
2386  }
2387  }
2388  if (index < threadIdIndex) {
2389  //
2390  // If thread ids were specified, it is an error if they are not
2391  // unique. Also, check that we haven't already restarted the
2392  // loop (to be safe - shouldn't need to).
2393  //
2394  if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2395  || assign_thread_ids) {
2396  __kmp_free(lastId);
2397  __kmp_free(totals);
2398  __kmp_free(maxCt);
2399  __kmp_free(counts);
2400  CLEANUP_THREAD_INFO;
2401  *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2402  return -1;
2403  }
2404 
2405  //
2406  // If the thread ids were not specified and we see entries
2407  // that are duplicates, start the loop over and
2408  // assign the thread ids manually.
2409  //
2410  assign_thread_ids = true;
2411  goto restart_radix_check;
2412  }
2413  }
2414 
2415 # if KMP_MIC && REDUCE_TEAM_SIZE
2416  //
2417  // The default team size is the total #threads in the machine
2418  // minus 1 thread for every core that has 3 or more threads.
2419  //
2420  teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2421 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2422 
2423  for (index = threadIdIndex; index <= maxIndex; index++) {
2424  if (counts[index] > maxCt[index]) {
2425  maxCt[index] = counts[index];
2426  }
2427  }
2428 
2429  __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2430  nCoresPerPkg = maxCt[coreIdIndex];
2431  nPackages = totals[pkgIdIndex];
2432 
2433  //
2434  // Check to see if the machine topology is uniform
2435  //
2436  unsigned prod = totals[maxIndex];
2437  for (index = threadIdIndex; index < maxIndex; index++) {
2438  prod *= maxCt[index];
2439  }
2440  bool uniform = (prod == totals[threadIdIndex]);
2441 
2442  //
2443  // When affinity is off, this routine will still be called to set
2444  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
2445  // nCoresPerPkg, & nPackages. Make sure all these vars are set
2446  // correctly, and return now if affinity is not enabled.
2447  //
2448  __kmp_ncores = totals[coreIdIndex];
2449 
2450  if (__kmp_affinity_verbose) {
2451  if (! KMP_AFFINITY_CAPABLE()) {
2452  KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2453  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2454  if (uniform) {
2455  KMP_INFORM(Uniform, "KMP_AFFINITY");
2456  } else {
2457  KMP_INFORM(NonUniform, "KMP_AFFINITY");
2458  }
2459  }
2460  else {
2461  char buf[KMP_AFFIN_MASK_PRINT_LEN];
2462  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);
2463  KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2464  if (__kmp_affinity_respect_mask) {
2465  KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2466  } else {
2467  KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2468  }
2469  KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2470  if (uniform) {
2471  KMP_INFORM(Uniform, "KMP_AFFINITY");
2472  } else {
2473  KMP_INFORM(NonUniform, "KMP_AFFINITY");
2474  }
2475  }
2476  kmp_str_buf_t buf;
2477  __kmp_str_buf_init(&buf);
2478 
2479  __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2480  for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2481  __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2482  }
2483  KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
2484  maxCt[threadIdIndex], __kmp_ncores);
2485 
2486  __kmp_str_buf_free(&buf);
2487  }
2488 
2489 # if KMP_MIC && REDUCE_TEAM_SIZE
2490  //
2491  // Set the default team size.
2492  //
2493  if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2494  __kmp_dflt_team_nth = teamSize;
2495  KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n",
2496  __kmp_dflt_team_nth));
2497  }
2498 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2499 
2500  if (__kmp_affinity_type == affinity_none) {
2501  __kmp_free(lastId);
2502  __kmp_free(totals);
2503  __kmp_free(maxCt);
2504  __kmp_free(counts);
2505  CLEANUP_THREAD_INFO;
2506  return 0;
2507  }
2508 
2509  //
2510  // Count the number of levels which have more nodes at that level than
2511  // at the parent's level (with there being an implicit root node above
2512  // the top level). This is equivalent to saying that there is at least
2513  // one node at this level which has a sibling. These levels are in the
2514  // map, and the package level is always in the map.
2515  //
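//
// Illustrative note (reusing the made-up counts from the sketch above):
// with totals[threadIdIndex] == 6, totals[coreIdIndex] == 3 and
// totals[pkgIdIndex] == 2, every level has a sibling somewhere, so
// inMap[] is true for all three levels and depth == 3.  If every core had
// exactly one thread, totals[threadIdIndex] would equal totals[coreIdIndex]
// and the thread level would be dropped from the map (depth == 2).
//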
2516  bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2517  int level = 0;
2518  for (index = threadIdIndex; index < maxIndex; index++) {
2519  KMP_ASSERT(totals[index] >= totals[index + 1]);
2520  inMap[index] = (totals[index] > totals[index + 1]);
2521  }
2522  inMap[maxIndex] = (totals[maxIndex] > 1);
2523  inMap[pkgIdIndex] = true;
2524 
2525  int depth = 0;
2526  for (index = threadIdIndex; index <= maxIndex; index++) {
2527  if (inMap[index]) {
2528  depth++;
2529  }
2530  }
2531  KMP_ASSERT(depth > 0);
2532 
2533  //
2534  // Construct the data structure that is to be returned.
2535  //
2536  *address2os = (AddrUnsPair*)
2537  __kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2538  int pkgLevel = -1;
2539  int coreLevel = -1;
2540  int threadLevel = -1;
2541 
2542  for (i = 0; i < num_avail; ++i) {
2543  Address addr(depth);
2544  unsigned os = threadInfo[i][osIdIndex];
2545  int src_index;
2546  int dst_index = 0;
2547 
2548  for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2549  if (! inMap[src_index]) {
2550  continue;
2551  }
2552  addr.labels[dst_index] = threadInfo[i][src_index];
2553  if (src_index == pkgIdIndex) {
2554  pkgLevel = dst_index;
2555  }
2556  else if (src_index == coreIdIndex) {
2557  coreLevel = dst_index;
2558  }
2559  else if (src_index == threadIdIndex) {
2560  threadLevel = dst_index;
2561  }
2562  dst_index++;
2563  }
2564  (*address2os)[i] = AddrUnsPair(addr, os);
2565  }
2566 
2567  if (__kmp_affinity_gran_levels < 0) {
2568  //
2569  // Set the granularity level based on what levels are modeled
2570  // in the machine topology map.
2571  //
2572  unsigned src_index;
2573  __kmp_affinity_gran_levels = 0;
2574  for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2575  if (! inMap[src_index]) {
2576  continue;
2577  }
2578  switch (src_index) {
2579  case threadIdIndex:
2580  if (__kmp_affinity_gran > affinity_gran_thread) {
2581  __kmp_affinity_gran_levels++;
2582  }
2583 
2584  break;
2585  case coreIdIndex:
2586  if (__kmp_affinity_gran > affinity_gran_core) {
2587  __kmp_affinity_gran_levels++;
2588  }
2589  break;
2590 
2591  case pkgIdIndex:
2592  if (__kmp_affinity_gran > affinity_gran_package) {
2593  __kmp_affinity_gran_levels++;
2594  }
2595  break;
2596  }
2597  }
2598  }
2599 
2600  if (__kmp_affinity_verbose) {
2601  __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2602  coreLevel, threadLevel);
2603  }
2604 
2605  __kmp_free(inMap);
2606  __kmp_free(lastId);
2607  __kmp_free(totals);
2608  __kmp_free(maxCt);
2609  __kmp_free(counts);
2610  CLEANUP_THREAD_INFO;
2611  return depth;
2612 }
2613 
2614 
2615 //
2616 // Create and return a table of affinity masks, indexed by OS thread ID.
2617 // This routine handles OR'ing together all the affinity masks of threads
2618 // that are sufficiently close, if granularity > fine.
2619 //
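//
// A small illustrative example (ids below are made up) of the coarsening
// this routine performs.  Assume two cores with two hardware threads each
// and __kmp_affinity_gran_levels == 1 (i.e. the trailing thread level is
// ignored when comparing addresses):
//
//     sorted address2os:  {pkg,core,thread} -> osId
//         {0,0,0} -> 0      {0,0,1} -> 4
//         {0,1,0} -> 1      {0,1,1} -> 5
//
//     resulting osId2Mask:  entries 0 and 4 both hold the mask {0,4},
//                           entries 1 and 5 both hold the mask {1,5},
//                           and numUnique == 2 (one mask per core).
//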
2620 static kmp_affin_mask_t *
2621 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
2622  AddrUnsPair *address2os, unsigned numAddrs)
2623 {
2624  //
2625  // First form a table of affinity masks in order of OS thread id.
2626  //
2627  unsigned depth;
2628  unsigned maxOsId;
2629  unsigned i;
2630 
2631  KMP_ASSERT(numAddrs > 0);
2632  depth = address2os[0].first.depth;
2633 
2634  maxOsId = 0;
2635  for (i = 0; i < numAddrs; i++) {
2636  unsigned osId = address2os[i].second;
2637  if (osId > maxOsId) {
2638  maxOsId = osId;
2639  }
2640  }
2641  kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
2642  (maxOsId + 1) * __kmp_affin_mask_size);
2643 
2644  //
2645  // Sort the address2os table according to physical order. Doing so
2646  // will put all threads on the same core/package/node in consecutive
2647  // locations.
2648  //
2649  qsort(address2os, numAddrs, sizeof(*address2os),
2650  __kmp_affinity_cmp_Address_labels);
2651 
2652  KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2653  if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2654  KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
2655  }
2656  if (__kmp_affinity_gran_levels >= (int)depth) {
2657  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2658  && (__kmp_affinity_type != affinity_none))) {
2659  KMP_WARNING(AffThreadsMayMigrate);
2660  }
2661  }
2662 
2663  //
2664  // Run through the table, forming the masks for all threads on each
2665  // core. Threads on the same core will have identical "Address"
2666  // objects, not considering the last level, which must be the thread
2667  // id. All threads on a core will appear consecutively.
2668  //
2669  unsigned unique = 0;
2670  unsigned j = 0; // index of 1st thread on core
2671  unsigned leader = 0;
2672  Address *leaderAddr = &(address2os[0].first);
2673  kmp_affin_mask_t *sum
2674  = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
2675  KMP_CPU_ZERO(sum);
2676  KMP_CPU_SET(address2os[0].second, sum);
2677  for (i = 1; i < numAddrs; i++) {
2678  //
2679  // If this thread is sufficiently close to the leader (within the
2680  // granularity setting), then set the bit for this os thread in the
2681  // affinity mask for this group, and go on to the next thread.
2682  //
2683  if (leaderAddr->isClose(address2os[i].first,
2684  __kmp_affinity_gran_levels)) {
2685  KMP_CPU_SET(address2os[i].second, sum);
2686  continue;
2687  }
2688 
2689  //
2690  // For every thread in this group, copy the mask to the thread's
2691  // entry in the osId2Mask table. Mark the first address as a
2692  // leader.
2693  //
2694  for (; j < i; j++) {
2695  unsigned osId = address2os[j].second;
2696  KMP_DEBUG_ASSERT(osId <= maxOsId);
2697  kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2698  KMP_CPU_COPY(mask, sum);
2699  address2os[j].first.leader = (j == leader);
2700  }
2701  unique++;
2702 
2703  //
2704  // Start a new mask.
2705  //
2706  leader = i;
2707  leaderAddr = &(address2os[i].first);
2708  KMP_CPU_ZERO(sum);
2709  KMP_CPU_SET(address2os[i].second, sum);
2710  }
2711 
2712  //
2713  // For every thread in last group, copy the mask to the thread's
2714  // entry in the osId2Mask table.
2715  //
2716  for (; j < i; j++) {
2717  unsigned osId = address2os[j].second;
2718  KMP_DEBUG_ASSERT(osId <= maxOsId);
2719  kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2720  KMP_CPU_COPY(mask, sum);
2721  address2os[j].first.leader = (j == leader);
2722  }
2723  unique++;
2724 
2725  *maxIndex = maxOsId;
2726  *numUnique = unique;
2727  return osId2Mask;
2728 }
2729 
2730 
2731 //
2732 // Stuff for the affinity proclist parsers. It's easier to declare these vars
2733 // as file-static than to try and pass them through the calling sequence of
2734 // the recursive-descent OMP_PLACES parser.
2735 //
2736 static kmp_affin_mask_t *newMasks;
2737 static int numNewMasks;
2738 static int nextNewMask;
2739 
2740 #define ADD_MASK(_mask) \
2741  { \
2742  if (nextNewMask >= numNewMasks) { \
2743  numNewMasks *= 2; \
2744  newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
2745  numNewMasks * __kmp_affin_mask_size); \
2746  } \
2747  KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
2748  nextNewMask++; \
2749  }
2750 
2751 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
2752  { \
2753  if (((_osId) > _maxOsId) || \
2754  (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
2755  if (__kmp_affinity_verbose || (__kmp_affinity_warnings \
2756  && (__kmp_affinity_type != affinity_none))) { \
2757  KMP_WARNING(AffIgnoreInvalidProcID, _osId); \
2758  } \
2759  } \
2760  else { \
2761  ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \
2762  } \
2763  }
2764 
2765 
2766 //
2767 // Re-parse the proclist (for the explicit affinity type), and form the list
2768 // of affinity newMasks indexed by gtid.
2769 //
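//
// Illustrative example (proc ids are made up) of a proclist accepted by
// this parser, e.g. as given through the proclist= modifier of KMP_AFFINITY:
//
//     "0,5,{2,3},8-15:2"
//
// Single ids (0 and 5) and every proc of the strided range 8,10,12,14 each
// produce their own mask in newMasks, while the braced set {2,3} is OR'ed
// into a single mask.  Masks are appended in the order the entries appear.
//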
2770 static void
2771 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2772  unsigned int *out_numMasks, const char *proclist,
2773  kmp_affin_mask_t *osId2Mask, int maxOsId)
2774 {
2775  const char *scan = proclist;
2776  const char *next = proclist;
2777 
2778  //
2779  // We use malloc() for the temporary mask vector,
2780  // so that we can use realloc() to extend it.
2781  //
2782  numNewMasks = 2;
2783  newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
2784  * __kmp_affin_mask_size);
2785  nextNewMask = 0;
2786  kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
2787  __kmp_affin_mask_size);
2788  int setSize = 0;
2789 
2790  for (;;) {
2791  int start, end, stride;
2792 
2793  SKIP_WS(scan);
2794  next = scan;
2795  if (*next == '\0') {
2796  break;
2797  }
2798 
2799  if (*next == '{') {
2800  int num;
2801  setSize = 0;
2802  next++; // skip '{'
2803  SKIP_WS(next);
2804  scan = next;
2805 
2806  //
2807  // Read the first integer in the set.
2808  //
2809  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2810  "bad explicit proc list");
2811  SKIP_DIGITS(next);
2812  num = __kmp_str_to_int(scan, *next);
2813  KMP_ASSERT2(num >= 0, "bad explicit proc list");
2814 
2815  //
2816  // Copy the mask for that osId to the sum (union) mask.
2817  //
2818  if ((num > maxOsId) ||
2819  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2820  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2821  && (__kmp_affinity_type != affinity_none))) {
2822  KMP_WARNING(AffIgnoreInvalidProcID, num);
2823  }
2824  KMP_CPU_ZERO(sumMask);
2825  }
2826  else {
2827  KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2828  setSize = 1;
2829  }
2830 
2831  for (;;) {
2832  //
2833  // Check for end of set.
2834  //
2835  SKIP_WS(next);
2836  if (*next == '}') {
2837  next++; // skip '}'
2838  break;
2839  }
2840 
2841  //
2842  // Skip optional comma.
2843  //
2844  if (*next == ',') {
2845  next++;
2846  }
2847  SKIP_WS(next);
2848 
2849  //
2850  // Read the next integer in the set.
2851  //
2852  scan = next;
2853  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2854  "bad explicit proc list");
2855 
2856  SKIP_DIGITS(next);
2857  num = __kmp_str_to_int(scan, *next);
2858  KMP_ASSERT2(num >= 0, "bad explicit proc list");
2859 
2860  //
2861  // Add the mask for that osId to the sum mask.
2862  //
2863  if ((num > maxOsId) ||
2864  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2865  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2866  && (__kmp_affinity_type != affinity_none))) {
2867  KMP_WARNING(AffIgnoreInvalidProcID, num);
2868  }
2869  }
2870  else {
2871  KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2872  setSize++;
2873  }
2874  }
2875  if (setSize > 0) {
2876  ADD_MASK(sumMask);
2877  }
2878 
2879  SKIP_WS(next);
2880  if (*next == ',') {
2881  next++;
2882  }
2883  scan = next;
2884  continue;
2885  }
2886 
2887  //
2888  // Read the first integer.
2889  //
2890  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2891  SKIP_DIGITS(next);
2892  start = __kmp_str_to_int(scan, *next);
2893  KMP_ASSERT2(start >= 0, "bad explicit proc list");
2894  SKIP_WS(next);
2895 
2896  //
2897  // If this isn't a range, then add a mask to the list and go on.
2898  //
2899  if (*next != '-') {
2900  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2901 
2902  //
2903  // Skip optional comma.
2904  //
2905  if (*next == ',') {
2906  next++;
2907  }
2908  scan = next;
2909  continue;
2910  }
2911 
2912  //
2913  // This is a range. Skip over the '-' and read in the 2nd int.
2914  //
2915  next++; // skip '-'
2916  SKIP_WS(next);
2917  scan = next;
2918  KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2919  SKIP_DIGITS(next);
2920  end = __kmp_str_to_int(scan, *next);
2921  KMP_ASSERT2(end >= 0, "bad explicit proc list");
2922 
2923  //
2924  // Check for a stride parameter
2925  //
2926  stride = 1;
2927  SKIP_WS(next);
2928  if (*next == ':') {
2929  //
2930  // A stride is specified. Skip over the ':' and read the 3rd int.
2931  //
2932  int sign = +1;
2933  next++; // skip ':'
2934  SKIP_WS(next);
2935  scan = next;
2936  if (*next == '-') {
2937  sign = -1;
2938  next++;
2939  SKIP_WS(next);
2940  scan = next;
2941  }
2942  KMP_ASSERT2((*next >= '0') && (*next <= '9'),
2943  "bad explicit proc list");
2944  SKIP_DIGITS(next);
2945  stride = __kmp_str_to_int(scan, *next);
2946  KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2947  stride *= sign;
2948  }
2949 
2950  //
2951  // Do some range checks.
2952  //
2953  KMP_ASSERT2(stride != 0, "bad explicit proc list");
2954  if (stride > 0) {
2955  KMP_ASSERT2(start <= end, "bad explicit proc list");
2956  }
2957  else {
2958  KMP_ASSERT2(start >= end, "bad explicit proc list");
2959  }
2960  KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2961 
2962  //
2963  // Add the mask for each OS proc # to the list.
2964  //
2965  if (stride > 0) {
2966  do {
2967  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2968  start += stride;
2969  } while (start <= end);
2970  }
2971  else {
2972  do {
2973  ADD_MASK_OSID(start, osId2Mask, maxOsId);
2974  start += stride;
2975  } while (start >= end);
2976  }
2977 
2978  //
2979  // Skip optional comma.
2980  //
2981  SKIP_WS(next);
2982  if (*next == ',') {
2983  next++;
2984  }
2985  scan = next;
2986  }
2987 
2988  *out_numMasks = nextNewMask;
2989  if (nextNewMask == 0) {
2990  *out_masks = NULL;
2991  KMP_INTERNAL_FREE(newMasks);
2992  return;
2993  }
2994  *out_masks
2995  = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
2996  KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
2997  __kmp_free(sumMask);
2998  KMP_INTERNAL_FREE(newMasks);
2999 }
3000 
3001 
3002 # if OMP_40_ENABLED
3003 
3004 /*-----------------------------------------------------------------------------
3005 
3006 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
3007 places. Again, here is the grammar:
3008 
3009 place_list := place
3010 place_list := place , place_list
3011 place := num
3012 place := place : num
3013 place := place : num : signed
3014 place := { subplace_list }
3015 place := ! place // (lowest priority)
3016 subplace_list := subplace
3017 subplace_list := subplace , subplace_list
3018 subplace := num
3019 subplace := num : num
3020 subplace := num : num : signed
3021 signed := num
3022 signed := + signed
3023 signed := - signed
3024 
3025 -----------------------------------------------------------------------------*/
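/*-----------------------------------------------------------------------------

For illustration only (the proc ids below are made up), place lists accepted
by this grammar include:

    {0,1,2,3},{4,5,6,7}     // two explicit places of four OS procs each
    {0:4},{4:4}             // the same two places, each written as a
                            // start : count subplace
    {0:4}:2:4               // a base place of procs 0-3, replicated twice
                            // with a stride of 4 procs

Each top-level place becomes one mask in newMasks.

-----------------------------------------------------------------------------*/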
3026 
3027 static void
3028 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
3029  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3030 {
3031  const char *next;
3032 
3033  for (;;) {
3034  int start, count, stride, i;
3035 
3036  //
3037  // Read in the starting proc id
3038  //
3039  SKIP_WS(*scan);
3040  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3041  "bad explicit places list");
3042  next = *scan;
3043  SKIP_DIGITS(next);
3044  start = __kmp_str_to_int(*scan, *next);
3045  KMP_ASSERT(start >= 0);
3046  *scan = next;
3047 
3048  //
3049  // valid follow sets are ',' ':' and '}'
3050  //
3051  SKIP_WS(*scan);
3052  if (**scan == '}' || **scan == ',') {
3053  if ((start > maxOsId) ||
3054  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3055  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3056  && (__kmp_affinity_type != affinity_none))) {
3057  KMP_WARNING(AffIgnoreInvalidProcID, start);
3058  }
3059  }
3060  else {
3061  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3062  (*setSize)++;
3063  }
3064  if (**scan == '}') {
3065  break;
3066  }
3067  (*scan)++; // skip ','
3068  continue;
3069  }
3070  KMP_ASSERT2(**scan == ':', "bad explicit places list");
3071  (*scan)++; // skip ':'
3072 
3073  //
3074  // Read count parameter
3075  //
3076  SKIP_WS(*scan);
3077  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3078  "bad explicit places list");
3079  next = *scan;
3080  SKIP_DIGITS(next);
3081  count = __kmp_str_to_int(*scan, *next);
3082  KMP_ASSERT(count >= 0);
3083  *scan = next;
3084 
3085  //
3086  // valid follow sets are ',' ':' and '}'
3087  //
3088  SKIP_WS(*scan);
3089  if (**scan == '}' || **scan == ',') {
3090  for (i = 0; i < count; i++) {
3091  if ((start > maxOsId) ||
3092  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3093  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3094  && (__kmp_affinity_type != affinity_none))) {
3095  KMP_WARNING(AffIgnoreInvalidProcID, start);
3096  }
3097  break; // don't proliferate warnings for large count
3098  }
3099  else {
3100  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3101  start++;
3102  (*setSize)++;
3103  }
3104  }
3105  if (**scan == '}') {
3106  break;
3107  }
3108  (*scan)++; // skip ','
3109  continue;
3110  }
3111  KMP_ASSERT2(**scan == ':', "bad explicit places list");
3112  (*scan)++; // skip ':'
3113 
3114  //
3115  // Read stride parameter
3116  //
3117  int sign = +1;
3118  for (;;) {
3119  SKIP_WS(*scan);
3120  if (**scan == '+') {
3121  (*scan)++; // skip '+'
3122  continue;
3123  }
3124  if (**scan == '-') {
3125  sign *= -1;
3126  (*scan)++; // skip '-'
3127  continue;
3128  }
3129  break;
3130  }
3131  SKIP_WS(*scan);
3132  KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
3133  "bad explicit places list");
3134  next = *scan;
3135  SKIP_DIGITS(next);
3136  stride = __kmp_str_to_int(*scan, *next);
3137  KMP_ASSERT(stride >= 0);
3138  *scan = next;
3139  stride *= sign;
3140 
3141  //
3142  // valid follow sets are ',' and '}'
3143  //
3144  SKIP_WS(*scan);
3145  if (**scan == '}' || **scan == ',') {
3146  for (i = 0; i < count; i++) {
3147  if ((start > maxOsId) ||
3148  (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3149  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3150  && (__kmp_affinity_type != affinity_none))) {
3151  KMP_WARNING(AffIgnoreInvalidProcID, start);
3152  }
3153  break; // don't proliferate warnings for large count
3154  }
3155  else {
3156  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3157  start += stride;
3158  (*setSize)++;
3159  }
3160  }
3161  if (**scan == '}') {
3162  break;
3163  }
3164  (*scan)++; // skip ','
3165  continue;
3166  }
3167 
3168  KMP_ASSERT2(0, "bad explicit places list");
3169  }
3170 }
3171 
3172 
3173 static void
3174 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3175  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
3176 {
3177  const char *next;
3178 
3179  //
3180  // valid follow sets are '{' '!' and num
3181  //
3182  SKIP_WS(*scan);
3183  if (**scan == '{') {
3184  (*scan)++; // skip '{'
3185  __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
3186  setSize);
3187  KMP_ASSERT2(**scan == '}', "bad explicit places list");
3188  (*scan)++; // skip '}'
3189  }
3190  else if (**scan == '!') {
3191  __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3192  KMP_CPU_COMPLEMENT(tempMask);
3193  (*scan)++; // skip '!'
3194  }
3195  else if ((**scan >= '0') && (**scan <= '9')) {
3196  next = *scan;
3197  SKIP_DIGITS(next);
3198  int num = __kmp_str_to_int(*scan, *next);
3199  KMP_ASSERT(num >= 0);
3200  if ((num > maxOsId) ||
3201  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3202  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3203  && (__kmp_affinity_type != affinity_none))) {
3204  KMP_WARNING(AffIgnoreInvalidProcID, num);
3205  }
3206  }
3207  else {
3208  KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3209  (*setSize)++;
3210  }
3211  *scan = next; // skip num
3212  }
3213  else {
3214  KMP_ASSERT2(0, "bad explicit places list");
3215  }
3216 }
3217 
3218 
3219 //static void
3220 void
3221 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3222  unsigned int *out_numMasks, const char *placelist,
3223  kmp_affin_mask_t *osId2Mask, int maxOsId)
3224 {
3225  const char *scan = placelist;
3226  const char *next = placelist;
3227 
3228  numNewMasks = 2;
3229  newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
3230  * __kmp_affin_mask_size);
3231  nextNewMask = 0;
3232 
3233  kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
3234  __kmp_affin_mask_size);
3235  KMP_CPU_ZERO(tempMask);
3236  int setSize = 0;
3237 
3238  for (;;) {
3239  __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3240 
3241  //
3242  // valid follow sets are ',' ':' and EOL
3243  //
3244  SKIP_WS(scan);
3245  if (*scan == '\0' || *scan == ',') {
3246  if (setSize > 0) {
3247  ADD_MASK(tempMask);
3248  }
3249  KMP_CPU_ZERO(tempMask);
3250  setSize = 0;
3251  if (*scan == '\0') {
3252  break;
3253  }
3254  scan++; // skip ','
3255  continue;
3256  }
3257 
3258  KMP_ASSERT2(*scan == ':', "bad explicit places list");
3259  scan++; // skip ':'
3260 
3261  //
3262  // Read count parameter
3263  //
3264  SKIP_WS(scan);
3265  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3266  "bad explicit places list");
3267  next = scan;
3268  SKIP_DIGITS(next);
3269  int count = __kmp_str_to_int(scan, *next);
3270  KMP_ASSERT(count >= 0);
3271  scan = next;
3272 
3273  //
3274  // valid follow sets are ',' ':' and EOL
3275  //
3276  SKIP_WS(scan);
3277  int stride;
3278  if (*scan == '\0' || *scan == ',') {
3279  stride = +1;
3280  }
3281  else {
3282  KMP_ASSERT2(*scan == ':', "bad explicit places list");
3283  scan++; // skip ':'
3284 
3285  //
3286  // Read stride parameter
3287  //
3288  int sign = +1;
3289  for (;;) {
3290  SKIP_WS(scan);
3291  if (*scan == '+') {
3292  scan++; // skip '+'
3293  continue;
3294  }
3295  if (*scan == '-') {
3296  sign *= -1;
3297  scan++; // skip '-'
3298  continue;
3299  }
3300  break;
3301  }
3302  SKIP_WS(scan);
3303  KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
3304  "bad explicit places list");
3305  next = scan;
3306  SKIP_DIGITS(next);
3307  stride = __kmp_str_to_int(scan, *next);
3308  KMP_DEBUG_ASSERT(stride >= 0);
3309  scan = next;
3310  stride *= sign;
3311  }
3312 
3313  if (stride > 0) {
3314  int i;
3315  for (i = 0; i < count; i++) {
3316  int j;
3317  if (setSize == 0) {
3318  break;
3319  }
3320  ADD_MASK(tempMask);
3321  setSize = 0;
3322  for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
3323  if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3324  KMP_CPU_CLR(j, tempMask);
3325  }
3326  else if ((j > maxOsId) ||
3327  (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3328  if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3329  && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
3330  KMP_WARNING(AffIgnoreInvalidProcID, j);
3331  }
3332  KMP_CPU_CLR(j, tempMask);
3333  }
3334  else {
3335  KMP_CPU_SET(j, tempMask);
3336  setSize++;
3337  }
3338  }
3339  for (; j >= 0; j--) {
3340  KMP_CPU_CLR(j, tempMask);
3341  }
3342  }
3343  }
3344  else {
3345  int i;
3346  for (i = 0; i < count; i++) {
3347  int j;
3348  if (setSize == 0) {
3349  break;
3350  }
3351  ADD_MASK(tempMask);
3352  setSize = 0;
3353  for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride;
3354  j++) {
3355  if (! KMP_CPU_ISSET(j - stride, tempMask)) {
3356  KMP_CPU_CLR(j, tempMask);
3357  }
3358  else if ((j > maxOsId) ||
3359  (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
3360  if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
3361  && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
3362  KMP_WARNING(AffIgnoreInvalidProcID, j);
3363  }
3364  KMP_CPU_CLR(j, tempMask);
3365  }
3366  else {
3367  KMP_CPU_SET(j, tempMask);
3368  setSize++;
3369  }
3370  }
3371  for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) {
3372  KMP_CPU_CLR(j, tempMask);
3373  }
3374  }
3375  }
3376  KMP_CPU_ZERO(tempMask);
3377  setSize = 0;
3378 
3379  //
3380  // valid follow sets are ',' and EOL
3381  //
3382  SKIP_WS(scan);
3383  if (*scan == '\0') {
3384  break;
3385  }
3386  if (*scan == ',') {
3387  scan++; // skip ','
3388  continue;
3389  }
3390 
3391  KMP_ASSERT2(0, "bad explicit places list");
3392  }
3393 
3394  *out_numMasks = nextNewMask;
3395  if (nextNewMask == 0) {
3396  *out_masks = NULL;
3397  KMP_INTERNAL_FREE(newMasks);
3398  return;
3399  }
3400  *out_masks
3401  = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
3402  KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
3403  __kmp_free(tempMask);
3404  KMP_INTERNAL_FREE(newMasks);
3405 }
3406 
3407 # endif /* OMP_40_ENABLED */
3408 
3409 #undef ADD_MASK
3410 #undef ADD_MASK_OSID
3411 
3412 static void
3413 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
3414 {
3415  if ( __kmp_place_num_cores == 0 ) {
3416  if ( __kmp_place_num_threads_per_core == 0 ) {
3417  return; // no core-limiting actions requested, exit
3418  }
3419  __kmp_place_num_cores = nCoresPerPkg; // use all available cores
3420  }
3421  if ( !__kmp_affinity_uniform_topology() ) {
3422  KMP_WARNING( AffThrPlaceNonUniform );
3423  return; // don't support non-uniform topology
3424  }
3425  if ( depth != 3 ) {
3426  KMP_WARNING( AffThrPlaceNonThreeLevel );
3427  return; // don't support topologies that are not 3-level
3428  }
3429  if ( __kmp_place_num_threads_per_core == 0 ) {
3430  __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts
3431  }
3432  if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
3433  KMP_WARNING( AffThrPlaceManyCores );
3434  return;
3435  }
3436 
3437  AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
3438  nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
3439  int i, j, k, n_old = 0, n_new = 0;
3440  for ( i = 0; i < nPackages; ++i ) {
3441  for ( j = 0; j < nCoresPerPkg; ++j ) {
3442  if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) {
3443  n_old += __kmp_nThreadsPerCore; // skip not-requested core
3444  } else {
3445  for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) {
3446  if ( k < __kmp_place_num_threads_per_core ) {
3447  newAddr[n_new] = (*pAddr)[n_old]; // copy requested core's data to new location
3448  n_new++;
3449  }
3450  n_old++;
3451  }
3452  }
3453  }
3454  }
3455  nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg
3456  __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
3457  __kmp_avail_proc = n_new; // correct avail_proc
3458  __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores
3459 
3460  __kmp_free( *pAddr );
3461  *pAddr = newAddr; // replace old topology with new one
3462 }
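//
// A worked example of the filtering above (all numbers are made up): on a
// uniform machine with nPackages == 2, nCoresPerPkg == 8 and
// __kmp_nThreadsPerCore == 4, the settings
//
//     __kmp_place_core_offset          == 2
//     __kmp_place_num_cores            == 4
//     __kmp_place_num_threads_per_core == 2
//
// keep cores 2..5 of each package and the first two hardware threads of
// each kept core, so __kmp_avail_proc becomes 2 * 4 * 2 == 16 and
// __kmp_ncores becomes 2 * 4 == 8.
//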
3463 
3464 
3465 static AddrUnsPair *address2os = NULL;
3466 static int * procarr = NULL;
3467 static int __kmp_aff_depth = 0;
3468 
3469 static void
3470 __kmp_aux_affinity_initialize(void)
3471 {
3472  if (__kmp_affinity_masks != NULL) {
3473  KMP_ASSERT(fullMask != NULL);
3474  return;
3475  }
3476 
3477  //
3478  // Create the "full" mask - this defines all of the processors that we
3479  // consider to be in the machine model. If respect is set, then it is
3480  // the initialization thread's affinity mask. Otherwise, it is all
3481  // processors that we know about on the machine.
3482  //
3483  if (fullMask == NULL) {
3484  fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
3485  }
3486  if (KMP_AFFINITY_CAPABLE()) {
3487  if (__kmp_affinity_respect_mask) {
3488  __kmp_get_system_affinity(fullMask, TRUE);
3489 
3490  //
3491  // Count the number of available processors.
3492  //
3493  unsigned i;
3494  __kmp_avail_proc = 0;
3495  for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
3496  if (! KMP_CPU_ISSET(i, fullMask)) {
3497  continue;
3498  }
3499  __kmp_avail_proc++;
3500  }
3501  if (__kmp_avail_proc > __kmp_xproc) {
3502  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3503  && (__kmp_affinity_type != affinity_none))) {
3504  KMP_WARNING(ErrorInitializeAffinity);
3505  }
3506  __kmp_affinity_type = affinity_none;
3507  KMP_AFFINITY_DISABLE();
3508  return;
3509  }
3510  }
3511  else {
3512  __kmp_affinity_entire_machine_mask(fullMask);
3513  __kmp_avail_proc = __kmp_xproc;
3514  }
3515  }
3516 
3517  int depth = -1;
3518  kmp_i18n_id_t msg_id = kmp_i18n_null;
3519 
3520  //
3521  // For backward compatibility, setting KMP_CPUINFO_FILE =>
3522  // KMP_TOPOLOGY_METHOD=cpuinfo
3523  //
3524  if ((__kmp_cpuinfo_file != NULL) &&
3525  (__kmp_affinity_top_method == affinity_top_method_all)) {
3526  __kmp_affinity_top_method = affinity_top_method_cpuinfo;
3527  }
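//
// For example (illustrative shell usage; the path is made up):
//
//     KMP_CPUINFO_FILE=/tmp/cpuinfo-copy ./app
//
// behaves as if KMP_TOPOLOGY_METHOD=cpuinfo had also been set, so the
// topology is parsed from the named file rather than discovered by the
// default sequence of methods below.
//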
3528 
3529  if (__kmp_affinity_top_method == affinity_top_method_all) {
3530  //
3531  // In the default code path, errors are not fatal - we just try using
3532  // another method. We only emit a warning message if affinity is on,
3533  // or the verbose flag is set, and the nowarnings flag was not set.
3534  //
3535  const char *file_name = NULL;
3536  int line = 0;
3537 
3538 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3539 
3540  if (__kmp_affinity_verbose) {
3541  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
3542  }
3543 
3544  file_name = NULL;
3545  depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3546  if (depth == 0) {
3547  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3548  KMP_ASSERT(address2os == NULL);
3549  return;
3550  }
3551 
3552  if (depth < 0) {
3553  if (__kmp_affinity_verbose) {
3554  if (msg_id != kmp_i18n_null) {
3555  KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
3556  KMP_I18N_STR(DecodingLegacyAPIC));
3557  }
3558  else {
3559  KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
3560  }
3561  }
3562 
3563  file_name = NULL;
3564  depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3565  if (depth == 0) {
3566  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3567  KMP_ASSERT(address2os == NULL);
3568  return;
3569  }
3570  }
3571 
3572 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3573 
3574 # if KMP_OS_LINUX
3575 
3576  if (depth < 0) {
3577  if (__kmp_affinity_verbose) {
3578  if (msg_id != kmp_i18n_null) {
3579  KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
3580  }
3581  else {
3582  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
3583  }
3584  }
3585 
3586  FILE *f = fopen("/proc/cpuinfo", "r");
3587  if (f == NULL) {
3588  msg_id = kmp_i18n_str_CantOpenCpuinfo;
3589  }
3590  else {
3591  file_name = "/proc/cpuinfo";
3592  depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3593  fclose(f);
3594  if (depth == 0) {
3595  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3596  KMP_ASSERT(address2os == NULL);
3597  return;
3598  }
3599  }
3600  }
3601 
3602 # endif /* KMP_OS_LINUX */
3603 
3604 # if KMP_GROUP_AFFINITY
3605 
3606  if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
3607  if (__kmp_affinity_verbose) {
3608  KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3609  }
3610 
3611  depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3612  KMP_ASSERT(depth != 0);
3613  }
3614 
3615 # endif /* KMP_GROUP_AFFINITY */
3616 
3617  if (depth < 0) {
3618  if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
3619  if (file_name == NULL) {
3620  KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
3621  }
3622  else if (line == 0) {
3623  KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
3624  }
3625  else {
3626  KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
3627  }
3628  }
3629  // FIXME - print msg if msg_id = kmp_i18n_null ???
3630 
3631  file_name = "";
3632  depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3633  if (depth == 0) {
3634  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3635  KMP_ASSERT(address2os == NULL);
3636  return;
3637  }
3638  KMP_ASSERT(depth > 0);
3639  KMP_ASSERT(address2os != NULL);
3640  }
3641  }
3642 
3643  //
3644  // If the user has specified that a particular topology discovery method
3645  // is to be used, then we abort if that method fails. The exception is
3646  // group affinity, which might have been implicitly set.
3647  //
3648 
3649 # if KMP_ARCH_X86 || KMP_ARCH_X86_64
3650 
3651  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
3652  if (__kmp_affinity_verbose) {
3653  KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3654  KMP_I18N_STR(Decodingx2APIC));
3655  }
3656 
3657  depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
3658  if (depth == 0) {
3659  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3660  KMP_ASSERT(address2os == NULL);
3661  return;
3662  }
3663  if (depth < 0) {
3664  KMP_ASSERT(msg_id != kmp_i18n_null);
3665  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3666  }
3667  }
3668  else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
3669  if (__kmp_affinity_verbose) {
3670  KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
3671  KMP_I18N_STR(DecodingLegacyAPIC));
3672  }
3673 
3674  depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
3675  if (depth == 0) {
3676  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3677  KMP_ASSERT(address2os == NULL);
3678  return;
3679  }
3680  if (depth < 0) {
3681  KMP_ASSERT(msg_id != kmp_i18n_null);
3682  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3683  }
3684  }
3685 
3686 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3687 
3688  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
3689  const char *filename;
3690  if (__kmp_cpuinfo_file != NULL) {
3691  filename = __kmp_cpuinfo_file;
3692  }
3693  else {
3694  filename = "/proc/cpuinfo";
3695  }
3696 
3697  if (__kmp_affinity_verbose) {
3698  KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
3699  }
3700 
3701  FILE *f = fopen(filename, "r");
3702  if (f == NULL) {
3703  int code = errno;
3704  if (__kmp_cpuinfo_file != NULL) {
3705  __kmp_msg(
3706  kmp_ms_fatal,
3707  KMP_MSG(CantOpenFileForReading, filename),
3708  KMP_ERR(code),
3709  KMP_HNT(NameComesFrom_CPUINFO_FILE),
3710  __kmp_msg_null
3711  );
3712  }
3713  else {
3714  __kmp_msg(
3715  kmp_ms_fatal,
3716  KMP_MSG(CantOpenFileForReading, filename),
3717  KMP_ERR(code),
3718  __kmp_msg_null
3719  );
3720  }
3721  }
3722  int line = 0;
3723  depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
3724  fclose(f);
3725  if (depth < 0) {
3726  KMP_ASSERT(msg_id != kmp_i18n_null);
3727  if (line > 0) {
3728  KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
3729  }
3730  else {
3731  KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
3732  }
3733  }
3734  if (__kmp_affinity_type == affinity_none) {
3735  KMP_ASSERT(depth == 0);
3736  KMP_ASSERT(address2os == NULL);
3737  return;
3738  }
3739  }
3740 
3741 # if KMP_GROUP_AFFINITY
3742 
3743  else if (__kmp_affinity_top_method == affinity_top_method_group) {
3744  if (__kmp_affinity_verbose) {
3745  KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
3746  }
3747 
3748  depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
3749  KMP_ASSERT(depth != 0);
3750  if (depth < 0) {
3751  KMP_ASSERT(msg_id != kmp_i18n_null);
3752  KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
3753  }
3754  }
3755 
3756 # endif /* KMP_GROUP_AFFINITY */
3757 
3758  else if (__kmp_affinity_top_method == affinity_top_method_flat) {
3759  if (__kmp_affinity_verbose) {
3760  KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
3761  }
3762 
3763  depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
3764  if (depth == 0) {
3765  KMP_ASSERT(__kmp_affinity_type == affinity_none);
3766  KMP_ASSERT(address2os == NULL);
3767  return;
3768  }
3769  // should not fail
3770  KMP_ASSERT(depth > 0);
3771  KMP_ASSERT(address2os != NULL);
3772  }
3773 
3774  if (address2os == NULL) {
3775  if (KMP_AFFINITY_CAPABLE()
3776  && (__kmp_affinity_verbose || (__kmp_affinity_warnings
3777  && (__kmp_affinity_type != affinity_none)))) {
3778  KMP_WARNING(ErrorInitializeAffinity);
3779  }
3780  __kmp_affinity_type = affinity_none;
3781  KMP_AFFINITY_DISABLE();
3782  return;
3783  }
3784 
3785  __kmp_apply_thread_places(&address2os, depth);
3786 
3787  //
3788  // Create the table of masks, indexed by OS proc id.
3789  //
3790  unsigned maxIndex;
3791  unsigned numUnique;
3792  kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
3793  address2os, __kmp_avail_proc);
3794  if (__kmp_affinity_gran_levels == 0) {
3795  KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
3796  }
3797 
3798  //
3799  // Set the childNums vector in all Address objects. This must be done
3800  // before we can sort using __kmp_affinity_cmp_Address_child_num(),
3801  // which takes into account the setting of __kmp_affinity_compact.
3802  //
3803  __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
3804 
3805  switch (__kmp_affinity_type) {
3806 
3807  case affinity_explicit:
3808  KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
3809 # if OMP_40_ENABLED
3810  if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
3811 # endif
3812  {
3813  __kmp_affinity_process_proclist(&__kmp_affinity_masks,
3814  &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3815  maxIndex);
3816  }
3817 # if OMP_40_ENABLED
3818  else {
3819  __kmp_affinity_process_placelist(&__kmp_affinity_masks,
3820  &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
3821  maxIndex);
3822  }
3823 # endif
3824  if (__kmp_affinity_num_masks == 0) {
3825  if (__kmp_affinity_verbose || (__kmp_affinity_warnings
3826  && (__kmp_affinity_type != affinity_none))) {
3827  KMP_WARNING(AffNoValidProcID);
3828  }
3829  __kmp_affinity_type = affinity_none;
3830  return;
3831  }
3832  break;
3833 
3834  //
3835  // The other affinity types rely on sorting the Addresses according
3836  // to some permutation of the machine topology tree. Set
3837  // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
3838  // then jump to a common code fragment to do the sort and create
3839  // the array of affinity masks.
3840  //
3841 
3842  case affinity_logical:
3843  __kmp_affinity_compact = 0;
3844  if (__kmp_affinity_offset) {
3845  __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3846  % __kmp_avail_proc;
3847  }
3848  goto sortAddresses;
3849 
3850  case affinity_physical:
3851  if (__kmp_nThreadsPerCore > 1) {
3852  __kmp_affinity_compact = 1;
3853  if (__kmp_affinity_compact >= depth) {
3854  __kmp_affinity_compact = 0;
3855  }
3856  } else {
3857  __kmp_affinity_compact = 0;
3858  }
3859  if (__kmp_affinity_offset) {
3860  __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
3861  % __kmp_avail_proc;
3862  }
3863  goto sortAddresses;
3864 
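// affinity_scatter is the mirror image of affinity_compact: the requested
// compaction level is inverted (depth - 1 - __kmp_affinity_compact), so
// consecutive threads are spread across the outermost topology levels first.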
3865  case affinity_scatter:
3866  if (__kmp_affinity_compact >= depth) {
3867  __kmp_affinity_compact = 0;
3868  }
3869  else {
3870  __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
3871  }
3872  goto sortAddresses;
3873 
3874  case affinity_compact:
3875  if (__kmp_affinity_compact >= depth) {
3876  __kmp_affinity_compact = depth - 1;
3877  }
3878  goto sortAddresses;
3879 
3880  case affinity_balanced:
3881  // Balanced works only for the case of a single package
3882  if( nPackages > 1 ) {
3883  if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
3884  KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
3885  }
3886  __kmp_affinity_type = affinity_none;
3887  return;
3888  } else if( __kmp_affinity_uniform_topology() ) {
3889  break;
3890  } else { // Non-uniform topology
3891 
3892  // Save the depth for further usage
3893  __kmp_aff_depth = depth;
3894 
3895  // Number of hyper-threads per core on a hyper-threaded (HT) machine
3896  int nth_per_core = __kmp_nThreadsPerCore;
3897 
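// core_level is the index of the core level within the address2os labels:
// one above the innermost (thread) level when there is more than one thread
// context per core, otherwise the innermost level itself.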
3898  int core_level;
3899  if( nth_per_core > 1 ) {
3900  core_level = depth - 2;
3901  } else {
3902  core_level = depth - 1;
3903  }
3904  int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
3905  int nproc = nth_per_core * ncores;
3906 
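// procarr maps each (core, thread-context) slot to its OS proc id; slots
// with no processor on this non-uniform machine remain -1.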
3907  procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
3908  for( int i = 0; i < nproc; i++ ) {
3909  procarr[ i ] = -1;
3910  }
3911 
3912  for( int i = 0; i < __kmp_avail_proc; i++ ) {
3913  int proc = address2os[ i ].second;
3914  // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
3915  // If there is only one thread per core then depth == 2: level 0 - package,
3916  // level 1 - core.
3917  int level = depth - 1;
3918 
3919  // Defaults for the case of one thread context per core (nth_per_core == 1)
3920  int thread = 0;
3921  int core = address2os[ i ].first.labels[ level ];
3922  // If the thread level exists, i.e. there is more than one thread context per core
3923  if( nth_per_core > 1 ) {
3924  thread = address2os[ i ].first.labels[ level ] % nth_per_core;
3925  core = address2os[ i ].first.labels[ level - 1 ];
3926  }
3927  procarr[ core * nth_per_core + thread ] = proc;
3928  }
3929 
3930  break;
3931  }
3932 
3933  sortAddresses:
3934  //
3935  // Allocate the gtid->affinity mask table.
3936  //
3937  if (__kmp_affinity_dups) {
3938  __kmp_affinity_num_masks = __kmp_avail_proc;
3939  }
3940  else {
3941  __kmp_affinity_num_masks = numUnique;
3942  }
3943 
3944 # if OMP_40_ENABLED
3945  if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
3946  && ( __kmp_affinity_num_places > 0 )
3947  && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
3948  __kmp_affinity_num_masks = __kmp_affinity_num_places;
3949  }
3950 # endif
3951 
3952  __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
3953  __kmp_affinity_num_masks * __kmp_affin_mask_size);
3954 
3955  //
3956  // Sort the address2os table according to the current setting of
3957  // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
3958  //
3959  qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
3960  __kmp_affinity_cmp_Address_child_num);
3961  {
3962  int i;
3963  unsigned j;
3964  for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
3965  if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
3966  continue;
3967  }
3968  unsigned osId = address2os[i].second;
3969  kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
3970  kmp_affin_mask_t *dest
3971  = KMP_CPU_INDEX(__kmp_affinity_masks, j);
3972  KMP_ASSERT(KMP_CPU_ISSET(osId, src));
3973  KMP_CPU_COPY(dest, src);
3974  if (++j >= __kmp_affinity_num_masks) {
3975  break;
3976  }
3977  }
3978  KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
3979  }
3980  break;
3981 
3982  default:
3983  KMP_ASSERT2(0, "Unexpected affinity setting");
3984  }
3985 
3986  __kmp_free(osId2Mask);
3987  machine_hierarchy.init(address2os, __kmp_avail_proc);
3988 }
3989 
3990 
3991 void
3992 __kmp_affinity_initialize(void)
3993 {
3994  //
3995  // Much of the code above was written assuming that if a machine was not
3996  // affinity capable, then __kmp_affinity_type == affinity_none. We now
3997  // explicitly represent this as __kmp_affinity_type == affinity_disabled.
3998  //
3999  // There are too many checks for __kmp_affinity_type == affinity_none
4000  // in this code. Instead of trying to change them all, check if
4001  // __kmp_affinity_type == affinity_disabled, and if so, slam it with
4002  // affinity_none, call the real initialization routine, then restore
4003  // __kmp_affinity_type to affinity_disabled.
4004  //
4005  int disabled = (__kmp_affinity_type == affinity_disabled);
4006  if (! KMP_AFFINITY_CAPABLE()) {
4007  KMP_ASSERT(disabled);
4008  }
4009  if (disabled) {
4010  __kmp_affinity_type = affinity_none;
4011  }
4012  __kmp_aux_affinity_initialize();
4013  if (disabled) {
4014  __kmp_affinity_type = affinity_disabled;
4015  }
4016 }
4017 
4018 
4019 void
4020 __kmp_affinity_uninitialize(void)
4021 {
4022  if (__kmp_affinity_masks != NULL) {
4023  __kmp_free(__kmp_affinity_masks);
4024  __kmp_affinity_masks = NULL;
4025  }
4026  if (fullMask != NULL) {
4027  KMP_CPU_FREE(fullMask);
4028  fullMask = NULL;
4029  }
4030  __kmp_affinity_num_masks = 0;
4031 # if OMP_40_ENABLED
4032  __kmp_affinity_num_places = 0;
4033 # endif
4034  if (__kmp_affinity_proclist != NULL) {
4035  __kmp_free(__kmp_affinity_proclist);
4036  __kmp_affinity_proclist = NULL;
4037  }
4038  if( address2os != NULL ) {
4039  __kmp_free( address2os );
4040  address2os = NULL;
4041  }
4042  if( procarr != NULL ) {
4043  __kmp_free( procarr );
4044  procarr = NULL;
4045  }
4046 }
4047 
4048 
4049 void
4050 __kmp_affinity_set_init_mask(int gtid, int isa_root)
4051 {
4052  if (! KMP_AFFINITY_CAPABLE()) {
4053  return;
4054  }
4055 
4056  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4057  if (th->th.th_affin_mask == NULL) {
4058  KMP_CPU_ALLOC(th->th.th_affin_mask);
4059  }
4060  else {
4061  KMP_CPU_ZERO(th->th.th_affin_mask);
4062  }
4063 
4064  //
4065  // Copy the thread mask to the kmp_info_t structure.
4066  // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
4067  // that has all of the OS proc ids set. If __kmp_affinity_respect_mask is
4068  // set, the full mask is the same as the mask of the initialization
4069  // thread.
4070  //
4071  kmp_affin_mask_t *mask;
4072  int i;
4073 
4074 # if OMP_40_ENABLED
4075  if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
4076 # endif
4077  {
4078  if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)
4079  ) {
4080 # if KMP_GROUP_AFFINITY
4081  if (__kmp_num_proc_groups > 1) {
4082  return;
4083  }
4084 # endif
4085  KMP_ASSERT(fullMask != NULL);
4086  i = KMP_PLACE_ALL;
4087  mask = fullMask;
4088  }
4089  else {
4090  KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4091  i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4092  mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4093  }
4094  }
4095 # if OMP_40_ENABLED
4096  else {
4097  if ((! isa_root)
4098  || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
4099 # if KMP_GROUP_AFFINITY
4100  if (__kmp_num_proc_groups > 1) {
4101  return;
4102  }
4103 # endif
4104  KMP_ASSERT(fullMask != NULL);
4105  i = KMP_PLACE_ALL;
4106  mask = fullMask;
4107  }
4108  else {
4109  //
4110  // Ideally i would come from a hash function, or at least a counter that
4111  // doesn't always start at 0. Use gtid for now.
4112  //
4113  KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
4114  i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4115  mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4116  }
4117  }
4118 # endif
4119 
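// Record the chosen place index (or KMP_PLACE_ALL), and for the root thread
// initialize its place partition to span all places.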
4120 # if OMP_40_ENABLED
4121  th->th.th_current_place = i;
4122  if (isa_root) {
4123  th->th.th_new_place = i;
4124  th->th.th_first_place = 0;
4125  th->th.th_last_place = __kmp_affinity_num_masks - 1;
4126  }
4127 
4128  if (i == KMP_PLACE_ALL) {
4129  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
4130  gtid));
4131  }
4132  else {
4133  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4134  gtid, i));
4135  }
4136 # else
4137  if (i == -1) {
4138  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
4139  gtid));
4140  }
4141  else {
4142  KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
4143  gtid, i));
4144  }
4145 # endif /* OMP_40_ENABLED */
4146 
4147  KMP_CPU_COPY(th->th.th_affin_mask, mask);
4148 
4149  if (__kmp_affinity_verbose) {
4150  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4151  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4152  th->th.th_affin_mask);
4153  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
4154  buf);
4155  }
4156 
4157 # if KMP_OS_WINDOWS
4158  //
4159  // On Windows* OS, the process affinity mask might have changed.
4160  // If the user didn't request affinity and this call fails,
4161  // just continue silently. See CQ171393.
4162  //
4163  if ( __kmp_affinity_type == affinity_none ) {
4164  __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4165  }
4166  else
4167 # endif
4168  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4169 }
4170 
4171 
4172 # if OMP_40_ENABLED
4173 
4174 void
4175 __kmp_affinity_set_place(int gtid)
4176 {
4177  int retval;
4178 
4179  if (! KMP_AFFINITY_CAPABLE()) {
4180  return;
4181  }
4182 
4183  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4184 
4185  KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
4186  gtid, th->th.th_new_place, th->th.th_current_place));
4187 
4188  //
4189  // Check that the new place is within this thread's partition.
4190  //
4191  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4192  KMP_ASSERT(th->th.th_new_place >= 0);
4193  KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4194  if (th->th.th_first_place <= th->th.th_last_place) {
4195  KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
4196  && (th->th.th_new_place <= th->th.th_last_place));
4197  }
4198  else {
4199  KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
4200  || (th->th.th_new_place >= th->th.th_last_place));
4201  }
4202 
4203  //
4204  // Copy the thread mask to the kmp_info_t structure,
4205  // and set this thread's affinity.
4206  //
4207  kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
4208  th->th.th_new_place);
4209  KMP_CPU_COPY(th->th.th_affin_mask, mask);
4210  th->th.th_current_place = th->th.th_new_place;
4211 
4212  if (__kmp_affinity_verbose) {
4213  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4214  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4215  th->th.th_affin_mask);
4216  KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4217  gtid, buf);
4218  }
4219  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4220 }
4221 
4222 # endif /* OMP_40_ENABLED */
4223 
4224 
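// The __kmp_aux_* entry points below back the low-level affinity API that the
// runtime exports to user code through omp.h (kmp_create_affinity_mask,
// kmp_set_affinity, kmp_get_affinity, kmp_set_affinity_mask_proc, and
// friends). As a rough usage sketch - assuming that public API is available -
// binding the calling thread to a single OS proc might look like this
// (bind_self_to_proc is just an illustrative helper, not part of the runtime):
//
//     #include <omp.h>
//
//     int bind_self_to_proc(int proc)
//     {
//         kmp_affinity_mask_t mask;
//         kmp_create_affinity_mask(&mask);
//         if (kmp_set_affinity_mask_proc(proc, &mask) != 0) {
//             // proc is out of range or not in the process's full mask
//             kmp_destroy_affinity_mask(&mask);
//             return -1;
//         }
//         int rc = kmp_set_affinity(&mask);  // ends up in __kmp_aux_set_affinity()
//         kmp_destroy_affinity_mask(&mask);
//         return rc;                         // 0 on success
//     }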
4225 int
4226 __kmp_aux_set_affinity(void **mask)
4227 {
4228  int gtid;
4229  kmp_info_t *th;
4230  int retval;
4231 
4232  if (! KMP_AFFINITY_CAPABLE()) {
4233  return -1;
4234  }
4235 
4236  gtid = __kmp_entry_gtid();
4237  KA_TRACE(1000, ;{
4238  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4239  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4240  (kmp_affin_mask_t *)(*mask));
4241  __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
4242  gtid, buf);
4243  });
4244 
4245  if (__kmp_env_consistency_check) {
4246  if ((mask == NULL) || (*mask == NULL)) {
4247  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4248  }
4249  else {
4250  unsigned proc;
4251  int num_procs = 0;
4252 
4253  for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
4254  if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4255  continue;
4256  }
4257  num_procs++;
4258  if (! KMP_CPU_ISSET(proc, fullMask)) {
4259  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4260  break;
4261  }
4262  }
4263  if (num_procs == 0) {
4264  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4265  }
4266 
4267 # if KMP_GROUP_AFFINITY
4268  if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4269  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4270  }
4271 # endif /* KMP_GROUP_AFFINITY */
4272 
4273  }
4274  }
4275 
4276  th = __kmp_threads[gtid];
4277  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4278  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4279  if (retval == 0) {
4280  KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4281  }
4282 
4283 # if OMP_40_ENABLED
4284  th->th.th_current_place = KMP_PLACE_UNDEFINED;
4285  th->th.th_new_place = KMP_PLACE_UNDEFINED;
4286  th->th.th_first_place = 0;
4287  th->th.th_last_place = __kmp_affinity_num_masks - 1;
4288 
4289  //
4290  // Turn off OpenMP 4.0 affinity for the current thread at this parallel level.
4291  //
4292  th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
4293 # endif
4294 
4295  return retval;
4296 }
4297 
4298 
4299 int
4300 __kmp_aux_get_affinity(void **mask)
4301 {
4302  int gtid;
4303  int retval;
4304  kmp_info_t *th;
4305 
4306  if (! KMP_AFFINITY_CAPABLE()) {
4307  return -1;
4308  }
4309 
4310  gtid = __kmp_entry_gtid();
4311  th = __kmp_threads[gtid];
4312  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4313 
4314  KA_TRACE(1000, ;{
4315  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4316  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4317  th->th.th_affin_mask);
4318  __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
4319  });
4320 
4321  if (__kmp_env_consistency_check) {
4322  if ((mask == NULL) || (*mask == NULL)) {
4323  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4324  }
4325  }
4326 
4327 # if !KMP_OS_WINDOWS
4328 
4329  retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4330  KA_TRACE(1000, ;{
4331  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4332  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4333  (kmp_affin_mask_t *)(*mask));
4334  __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
4335  });
4336  return retval;
4337 
4338 # else
4339 
4340  KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4341  return 0;
4342 
4343 # endif /* KMP_OS_WINDOWS */
4344 
4345 }
4346 
4347 int
4348 __kmp_aux_set_affinity_mask_proc(int proc, void **mask)
4349 {
4350  int retval;
4351 
4352  if (! KMP_AFFINITY_CAPABLE()) {
4353  return -1;
4354  }
4355 
4356  KA_TRACE(1000, ;{
4357  int gtid = __kmp_entry_gtid();
4358  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4359  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4360  (kmp_affin_mask_t *)(*mask));
4361  __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
4362  proc, gtid, buf);
4363  });
4364 
4365  if (__kmp_env_consistency_check) {
4366  if ((mask == NULL) || (*mask == NULL)) {
4367  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4368  }
4369  }
4370 
4371  if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4372  return -1;
4373  }
4374  if (! KMP_CPU_ISSET(proc, fullMask)) {
4375  return -2;
4376  }
4377 
4378  KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
4379  return 0;
4380 }
4381 
4382 
4383 int
4384 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
4385 {
4386  int retval;
4387 
4388  if (! KMP_AFFINITY_CAPABLE()) {
4389  return -1;
4390  }
4391 
4392  KA_TRACE(1000, ;{
4393  int gtid = __kmp_entry_gtid();
4394  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4395  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4396  (kmp_affin_mask_t *)(*mask));
4397  __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
4398  proc, gtid, buf);
4399  });
4400 
4401  if (__kmp_env_consistency_check) {
4402  if ((mask == NULL) || (*mask == NULL)) {
4403  KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
4404  }
4405  }
4406 
4407  if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4408  return -1;
4409  }
4410  if (! KMP_CPU_ISSET(proc, fullMask)) {
4411  return -2;
4412  }
4413 
4414  KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
4415  return 0;
4416 }
4417 
4418 
4419 int
4420 __kmp_aux_get_affinity_mask_proc(int proc, void **mask)
4421 {
4422  int retval;
4423 
4424  if (! KMP_AFFINITY_CAPABLE()) {
4425  return -1;
4426  }
4427 
4428  KA_TRACE(1000, ;{
4429  int gtid = __kmp_entry_gtid();
4430  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4431  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4432  (kmp_affin_mask_t *)(*mask));
4433  __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
4434  proc, gtid, buf);
4435  });
4436 
4437  if (__kmp_env_consistency_check) {
4438  if ((mask == NULL) || (*mask == NULL)) {
4439  KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
4440  }
4441  }
4442 
4443  if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
4444  return 0;
4445  }
4446  if (! KMP_CPU_ISSET(proc, fullMask)) {
4447  return 0;
4448  }
4449 
4450  return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
4451 }
4452 
4453 
4454  // Dynamic affinity settings - balanced affinity
4455 void __kmp_balanced_affinity( int tid, int nthreads )
4456 {
4457  if( __kmp_affinity_uniform_topology() ) {
4458  int coreID;
4459  int threadID;
4460  // Number of hyper-threads per core on a hyper-threaded (HT) machine
4461  int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
4462  // Number of cores
4463  int ncores = __kmp_ncores;
4464  // How many threads will be bound to each core
4465  int chunk = nthreads / ncores;
4466  // How many cores will have an additional thread bound to them - the "big" cores
4467  int big_cores = nthreads % ncores;
4468  // Number of threads on the big cores
4469  int big_nth = ( chunk + 1 ) * big_cores;
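// Example: nthreads = 10 and ncores = 4 gives chunk = 2, big_cores = 2 and
// big_nth = 6, so threads 0-5 go three per core to cores 0-1 and threads 6-9
// go two per core to cores 2-3.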
4470  if( tid < big_nth ) {
4471  coreID = tid / (chunk + 1 );
4472  threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
4473  } else { //tid >= big_nth
4474  coreID = ( tid - big_cores ) / chunk;
4475  threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
4476  }
4477 
4478  KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
4479  "Illegal set affinity operation when not capable");
4480 
4481  kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
4482  KMP_CPU_ZERO(mask);
4483 
4484  // Granularity == thread
4485  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4486  int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
4487  KMP_CPU_SET( osID, mask);
4488  } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4489  for( int i = 0; i < __kmp_nth_per_core; i++ ) {
4490  int osID;
4491  osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
4492  KMP_CPU_SET( osID, mask);
4493  }
4494  }
4495  if (__kmp_affinity_verbose) {
4496  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4497  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4498  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4499  tid, buf);
4500  }
4501  __kmp_set_system_affinity( mask, TRUE );
4502  } else { // Non-uniform topology
4503 
4504  kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
4505  KMP_CPU_ZERO(mask);
4506 
4507  // Number of hyper-threads per core on a hyper-threaded (HT) machine
4508  int nth_per_core = __kmp_nThreadsPerCore;
4509  int core_level;
4510  if( nth_per_core > 1 ) {
4511  core_level = __kmp_aff_depth - 2;
4512  } else {
4513  core_level = __kmp_aff_depth - 1;
4514  }
4515 
4516  // Number of cores - maximum value; it does not count trailing cores with 0 processors
4517  int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
4518 
4519  // For performance, handle the special case nthreads == __kmp_avail_proc separately
4520  if( nthreads == __kmp_avail_proc ) {
4521  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4522  int osID = address2os[ tid ].second;
4523  KMP_CPU_SET( osID, mask);
4524  } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4525  int coreID = address2os[ tid ].first.labels[ core_level ];
4526  // Count the osIDs found for the current core; there can be at most nth_per_core of them.
4527  // Since address2os is sorted, we can break as soon as cnt == nth_per_core.
4528  int cnt = 0;
4529  for( int i = 0; i < __kmp_avail_proc; i++ ) {
4530  int osID = address2os[ i ].second;
4531  int core = address2os[ i ].first.labels[ core_level ];
4532  if( core == coreID ) {
4533  KMP_CPU_SET( osID, mask);
4534  cnt++;
4535  if( cnt == nth_per_core ) {
4536  break;
4537  }
4538  }
4539  }
4540  }
4541  } else if( nthreads <= __kmp_ncores ) {
4542 
4543  int core = 0;
4544  for( int i = 0; i < ncores; i++ ) {
4545  // Check whether this core has any available processor in procarr[]
4546  int in_mask = 0;
4547  for( int j = 0; j < nth_per_core; j++ ) {
4548  if( procarr[ i * nth_per_core + j ] != - 1 ) {
4549  in_mask = 1;
4550  break;
4551  }
4552  }
4553  if( in_mask ) {
4554  if( tid == core ) {
4555  for( int j = 0; j < nth_per_core; j++ ) {
4556  int osID = procarr[ i * nth_per_core + j ];
4557  if( osID != -1 ) {
4558  KMP_CPU_SET( osID, mask );
4559  // For granularity=thread it is enough to set the first available osID for this core
4560  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4561  break;
4562  }
4563  }
4564  }
4565  break;
4566  } else {
4567  core++;
4568  }
4569  }
4570  }
4571 
4572  } else { // nthreads > __kmp_ncores
4573 
4574  // Array to save the number of processors at each core
4575  int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores);
4576  // Array to save the number of cores with exactly "x" available processors
4577  int* ncores_with_x_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
4578  // Array to save the number of cores with between x and nth_per_core available processors
4579  int* ncores_with_x_to_max_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
4580 
4581  for( int i = 0; i <= nth_per_core; i++ ) {
4582  ncores_with_x_procs[ i ] = 0;
4583  ncores_with_x_to_max_procs[ i ] = 0;
4584  }
4585 
4586  for( int i = 0; i < ncores; i++ ) {
4587  int cnt = 0;
4588  for( int j = 0; j < nth_per_core; j++ ) {
4589  if( procarr[ i * nth_per_core + j ] != -1 ) {
4590  cnt++;
4591  }
4592  }
4593  nproc_at_core[ i ] = cnt;
4594  ncores_with_x_procs[ cnt ]++;
4595  }
4596 
4597  for( int i = 0; i <= nth_per_core; i++ ) {
4598  for( int j = i; j <= nth_per_core; j++ ) {
4599  ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
4600  }
4601  }
4602 
4603  // Max number of processors
4604  int nproc = nth_per_core * ncores;
4605  // Array holding the number of threads assigned to each thread context
4606  int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4607  for( int i = 0; i < nproc; i++ ) {
4608  newarr[ i ] = 0;
4609  }
4610 
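// Distribute the threads over the available thread contexts: the first pass
// (flag == 0) assigns at most one thread per populated context; subsequent
// passes stack extra threads onto already-used contexts so that any
// oversubscription stays as even as possible.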
4611  int nth = nthreads;
4612  int flag = 0;
4613  while( nth > 0 ) {
4614  for( int j = 1; j <= nth_per_core; j++ ) {
4615  int cnt = ncores_with_x_to_max_procs[ j ];
4616  for( int i = 0; i < ncores; i++ ) {
4617  // Skip cores with 0 available processors
4618  if( nproc_at_core[ i ] == 0 ) {
4619  continue;
4620  }
4621  for( int k = 0; k < nth_per_core; k++ ) {
4622  if( procarr[ i * nth_per_core + k ] != -1 ) {
4623  if( newarr[ i * nth_per_core + k ] == 0 ) {
4624  newarr[ i * nth_per_core + k ] = 1;
4625  cnt--;
4626  nth--;
4627  break;
4628  } else {
4629  if( flag != 0 ) {
4630  newarr[ i * nth_per_core + k ] ++;
4631  cnt--;
4632  nth--;
4633  break;
4634  }
4635  }
4636  }
4637  }
4638  if( cnt == 0 || nth == 0 ) {
4639  break;
4640  }
4641  }
4642  if( nth == 0 ) {
4643  break;
4644  }
4645  }
4646  flag = 1;
4647  }
4648  int sum = 0;
4649  for( int i = 0; i < nproc; i++ ) {
4650  sum += newarr[ i ];
4651  if( sum > tid ) {
4652  // Granularity == thread
4653  if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4654  int osID = procarr[ i ];
4655  KMP_CPU_SET( osID, mask);
4656  } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4657  int coreID = i / nth_per_core;
4658  for( int ii = 0; ii < nth_per_core; ii++ ) {
4659  int osID = procarr[ coreID * nth_per_core + ii ];
4660  if( osID != -1 ) {
4661  KMP_CPU_SET( osID, mask);
4662  }
4663  }
4664  }
4665  break;
4666  }
4667  }
4668  __kmp_free( newarr );
4669  }
4670 
4671  if (__kmp_affinity_verbose) {
4672  char buf[KMP_AFFIN_MASK_PRINT_LEN];
4673  __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
4674  KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4675  tid, buf);
4676  }
4677  __kmp_set_system_affinity( mask, TRUE );
4678  }
4679 }
4680 
4681 #else
4682  // affinity not supported
4683 
4684 static const kmp_uint32 noaff_maxLevels=7;
4685 kmp_uint32 noaff_skipPerLevel[noaff_maxLevels];
4686 kmp_uint32 noaff_depth;
4687 kmp_uint8 noaff_leaf_kids;
4688 kmp_int8 noaff_uninitialized=1;
4689 
4690 void noaff_init(int nprocs)
4691 {
4692  kmp_int8 result = KMP_COMPARE_AND_STORE_ACQ8(&noaff_uninitialized, 1, 2);
4693  if (result == 0) return; // Already initialized
4694  else if (result == 2) { // Someone else is initializing
4695  while (TCR_1(noaff_uninitialized) != 0) KMP_CPU_PAUSE();
4696  return;
4697  }
4698  KMP_DEBUG_ASSERT(result==1);
4699 
4700  kmp_uint32 numPerLevel[noaff_maxLevels];
4701  noaff_depth = 1;
4702  for (kmp_uint32 i=0; i<noaff_maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
4703  numPerLevel[i] = 1;
4704  noaff_skipPerLevel[i] = 1;
4705  }
4706 
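// With no topology information available, synthesize a hierarchy whose leaves
// are grouped four at a time at level 0, with roughly nprocs/4 groups at the
// level above; the loop further down then narrows any level that grew too wide.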
4707  numPerLevel[0] = 4;
4708  numPerLevel[1] = nprocs/4;
4709  if (nprocs%4) numPerLevel[1]++;
4710 
4711  for (int i=noaff_maxLevels-1; i>=0; --i) // count non-empty levels to get depth
4712  if (numPerLevel[i] != 1 || noaff_depth > 1) // only count one top-level '1'
4713  noaff_depth++;
4714 
4715  kmp_uint32 branch = 4;
4716  if (numPerLevel[0] == 1) branch = nprocs/4;
4717  if (branch<4) branch=4;
4718  for (kmp_uint32 d=0; d<noaff_depth-1; ++d) { // optimize hierarchy width
4719  while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
4720  if (numPerLevel[d] & 1) numPerLevel[d]++;
4721  numPerLevel[d] = numPerLevel[d] >> 1;
4722  if (numPerLevel[d+1] == 1) noaff_depth++;
4723  numPerLevel[d+1] = numPerLevel[d+1] << 1;
4724  }
4725  if(numPerLevel[0] == 1) {
4726  branch = branch >> 1;
4727  if (branch<4) branch = 4;
4728  }
4729  }
4730 
4731  for (kmp_uint32 i=1; i<noaff_depth; ++i)
4732  noaff_skipPerLevel[i] = numPerLevel[i-1] * noaff_skipPerLevel[i-1];
4733  // Fill in hierarchy in the case of oversubscription
4734  for (kmp_uint32 i=noaff_depth; i<noaff_maxLevels; ++i)
4735  noaff_skipPerLevel[i] = 2*noaff_skipPerLevel[i-1];
4736  noaff_leaf_kids = (kmp_uint8)numPerLevel[0]-1;
4737  noaff_uninitialized = 0; // One writer
4738 
4739 }
4740 
4741 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
4742  if (noaff_uninitialized)
4743  noaff_init(nproc);
4744 
4745  thr_bar->depth = noaff_depth;
4746  thr_bar->base_leaf_kids = noaff_leaf_kids;
4747  thr_bar->skip_per_level = noaff_skipPerLevel;
4748 }
4749 
4750 #endif // KMP_AFFINITY_SUPPORTED