llvm-project/openmp/runtime/src/kmp_affinity.cpp
Chandler Carruth 57b08b0944 Update more file headers across all of the LLVM projects in the monorepo
to reflect the new license. These used slightly different spellings that
defeated my regular expressions.

We understand that people may be surprised that we're moving the header
entirely to discuss the new license. We checked this carefully with the
Foundation's lawyer and we believe this is the correct approach.

Essentially, all code in the project is now made available by the LLVM
project under our new license, so you will see that the license headers
include that license only. Some of our contributors have contributed
code under our old license, and accordingly, we have retained a copy of
our old license notice in the top-level files in each project and
repository.

llvm-svn: 351648
2019-01-19 10:56:40 +00:00

5379 lines
185 KiB
C++

/*
* kmp_affinity.cpp -- affinity management
*/
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif
// The machine hierarchy, whether real or imagined, is kept in this
// file-local object.
static hierarchy_info machine_hierarchy;

// Tear down the stored machine hierarchy by calling its fini() routine.
void __kmp_cleanup_hierarchy() {
  machine_hierarchy.fini();
}
// Copy the barrier-relevant parts of the machine hierarchy into *thr_bar for
// a team of nproc threads: the hierarchy depth, the first level's fan-out
// minus one, and the per-level skip table.
void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
  // The hierarchy can still be uninitialized here when affinity is available
  // but set to "none"; it is then built on first use of the hierarchical
  // barrier.
  if (TCR_1(machine_hierarchy.uninitialized)) {
    machine_hierarchy.init(NULL, nproc);
  }

  // Grow the hierarchy if more threads are requested than it was built for.
  if (nproc > machine_hierarchy.base_num_threads) {
    machine_hierarchy.resize(nproc);
  }

  kmp_uint32 depth = machine_hierarchy.depth;
  KMP_DEBUG_ASSERT(depth > 0);

  thr_bar->depth = depth;
  thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0] - 1;
  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}
#if KMP_AFFINITY_SUPPORTED
// Becomes true once pick_api() has installed an affinity implementation and
// false again after destroy_api() has released it.
bool KMPAffinity::picked_api = false;

// Mask objects (single and array forms) are obtained from __kmp_allocate()
// and handed back through __kmp_free().
void *KMPAffinity::Mask::operator new(size_t n) {
  return __kmp_allocate(n);
}

void *KMPAffinity::Mask::operator new[](size_t n) {
  return __kmp_allocate(n);
}

void KMPAffinity::Mask::operator delete(void *p) {
  __kmp_free(p);
}

void KMPAffinity::Mask::operator delete[](void *p) {
  __kmp_free(p);
}

// KMPAffinity objects themselves use the same allocation routines.
void *KMPAffinity::operator new(size_t n) {
  return __kmp_allocate(n);
}

void KMPAffinity::operator delete(void *p) {
  __kmp_free(p);
}
void KMPAffinity::pick_api() {
KMPAffinity *affinity_dispatch;
if (picked_api)
return;
#if KMP_USE_HWLOC
// Only use Hwloc if affinity isn't explicitly disabled and
// user requests Hwloc topology method
if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
__kmp_affinity_type != affinity_disabled) {
affinity_dispatch = new KMPHwlocAffinity();
} else
#endif
{
affinity_dispatch = new KMPNativeAffinity();
}
__kmp_affinity_dispatch = affinity_dispatch;
picked_api = true;
}
// Delete the installed affinity implementation, if there is one, and reset
// the state so that pick_api() will choose again on its next call.
void KMPAffinity::destroy_api() {
  if (__kmp_affinity_dispatch == NULL) {
    return;
  }
  delete __kmp_affinity_dispatch;
  __kmp_affinity_dispatch = NULL;
  picked_api = false;
}
// Move 'scan' forward until it points at the terminating NUL of its string.
#define KMP_ADVANCE_SCAN(scan)                                                 \
  while (*scan != '\0') {                                                      \
    scan++;                                                                    \
  }

// Render an affinity mask into a caller-supplied character array.
// Output is a comma separated list of non-negative integers or integer
// ranges, e.g. 1,2,3-5,7,9-15, or the literal "{<empty>}" when no bit is set.
// buf_len must be at least 40. Printing stops early, without an error, once
// fewer than two characters remain in the buffer. Returns buf.
char *__kmp_affinity_print_mask(char *buf, int buf_len,
                                kmp_affin_mask_t *mask) {
  KMP_ASSERT(buf);
  KMP_ASSERT(buf_len >= 40);
  KMP_ASSERT(mask);

  char *scan = buf;
  char *end = buf + buf_len - 1;

  // An empty mask has its own fixed spelling.
  if (mask->begin() == mask->end()) {
    KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
    KMP_ADVANCE_SCAN(scan);
    KMP_ASSERT(scan <= end);
    return buf;
  }

  bool need_comma = false; // every range except the first is comma-prefixed
  int start = mask->begin();
  for (;;) {
    // Extend [start, previous] over the run of contiguous set bits; 'finish'
    // ends up on the first set bit after the run (or on mask->end()).
    int previous = start;
    int finish = mask->next(start);
    while (finish == previous + 1 && finish != mask->end()) {
      previous = finish;
      finish = mask->next(finish);
    }

    if (need_comma) {
      KMP_SNPRINTF(scan, end - scan + 1, "%s", ",");
      KMP_ADVANCE_SCAN(scan);
    }
    need_comma = true;

    const int span = previous - start;
    if (span > 1) {
      // Three or more contiguous bits are written as "first-last".
      KMP_SNPRINTF(scan, end - scan + 1, "%d-%d", static_cast<int>(start),
                   static_cast<int>(previous));
    } else {
      // One or two contiguous bits are written out individually.
      KMP_SNPRINTF(scan, end - scan + 1, "%d", static_cast<int>(start));
      KMP_ADVANCE_SCAN(scan);
      if (span > 0) {
        KMP_SNPRINTF(scan, end - scan + 1, ",%d", static_cast<int>(previous));
      }
    }
    KMP_ADVANCE_SCAN(scan);

    // Continue from the next run; stop at the end of the mask or when the
    // buffer is about to overflow.
    start = finish;
    if (start == mask->end() || end - scan < 2) {
      break;
    }
  }

  // Check for overflow
  KMP_ASSERT(scan <= end);
  return buf;
}
#undef KMP_ADVANCE_SCAN
// Render an affinity mask into a string buffer object, replacing whatever the
// buffer held before. Output is a comma separated list of non-negative
// integers or integer ranges, e.g. 1,2,3-5,7,9-15, or the literal "{<empty>}"
// when no bit is set in the mask. Returns buf.
kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
                                           kmp_affin_mask_t *mask) {
  KMP_ASSERT(buf);
  KMP_ASSERT(mask);
  __kmp_str_buf_clear(buf);

  // An empty mask has its own fixed spelling.
  if (mask->begin() == mask->end()) {
    __kmp_str_buf_print(buf, "%s", "{<empty>}");
    return buf;
  }

  bool need_comma = false; // every range except the first is comma-prefixed
  int start = mask->begin();
  do {
    // Extend [start, previous] over the run of contiguous set bits; 'finish'
    // ends up on the first set bit after the run (or on mask->end()).
    int previous = start;
    int finish = mask->next(start);
    while (finish == previous + 1 && finish != mask->end()) {
      previous = finish;
      finish = mask->next(finish);
    }

    if (need_comma) {
      __kmp_str_buf_print(buf, "%s", ",");
    }
    need_comma = true;

    const int span = previous - start;
    if (span > 1) {
      // Three or more contiguous bits are written as "first-last".
      __kmp_str_buf_print(buf, "%d-%d", static_cast<int>(start),
                          static_cast<int>(previous));
    } else {
      // One or two contiguous bits are written out individually.
      __kmp_str_buf_print(buf, "%d", static_cast<int>(start));
      if (span > 0) {
        __kmp_str_buf_print(buf, ",%d", static_cast<int>(previous));
      }
    }

    // Continue from the first bit after this run.
    start = finish;
  } while (start != mask->end());

  return buf;
}
// Clear *mask and then set one bit per processor of the machine. With more
// than one processor group, each group's active processors are set starting
// at bit group * (CHAR_BIT * sizeof(DWORD_PTR)); otherwise bits
// 0 .. __kmp_xproc-1 are set.
void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
  KMP_CPU_ZERO(mask);
#if KMP_GROUP_AFFINITY
  if (__kmp_num_proc_groups > 1) {
    KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
    int group = 0;
    while (group < __kmp_num_proc_groups) {
      int num = __kmp_GetActiveProcessorCount(group);
      int i = 0;
      while (i < num) {
        KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
        i++;
      }
      group++;
    }
  } else
#endif /* KMP_GROUP_AFFINITY */
  {
    int proc = 0;
    while (proc < __kmp_xproc) {
      KMP_CPU_SET(proc, mask);
      proc++;
    }
  }
}
// Renumber the labels at every level into ordinal child numbers and store
// them in the childNums vector of each address. This must be done before
// sorting by labels, because the labels used for the children of one node of
// the hierarchy may differ from those used under another node at the same
// level. Example: a machine with 2 nodes of 2 packages each, where the first
// node holds packages 601 and 602 and the second holds 603 and 604. Sorting
// for "scatter" affinity on the raw labels would still give 601, 602, 603,
// 604; sorting on the child numbers gives
// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
static void __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
                                             int numAddrs) {
  KMP_DEBUG_ASSERT(numAddrs > 0);
  int depth = address2os->first.depth;
  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));

  // The first entry is child 0 at every level and seeds the "last seen"
  // label of each level.
  for (int lvl = 0; lvl < depth; lvl++) {
    counts[lvl] = 0;
    address2os[0].first.childNums[lvl] = 0;
    lastLabel[lvl] = address2os[0].first.labels[lvl];
  }

  for (int i = 1; i < numAddrs; i++) {
    // Locate the shallowest level whose label differs from the previous entry.
    int diff = 0;
    while (diff < depth &&
           address2os[i].first.labels[diff] == lastLabel[diff]) {
      diff++;
    }

    if (diff < depth) {
      // A new child starts at that level; every deeper level restarts at 0.
      counts[diff]++;
      lastLabel[diff] = address2os[i].first.labels[diff];
      for (int deeper = diff + 1; deeper < depth; deeper++) {
        counts[deeper] = 0;
        lastLabel[deeper] = address2os[i].first.labels[deeper];
      }
    }

    // Record the current counters, zero-filling the unused trailing levels.
    int lvl = 0;
    while (lvl < depth) {
      address2os[i].first.childNums[lvl] = counts[lvl];
      lvl++;
    }
    while (lvl < (int)Address::maxDepth) {
      address2os[i].first.childNums[lvl] = 0;
      lvl++;
    }
  }

  __kmp_free(lastLabel);
  __kmp_free(counts);
}
// All of the __kmp_affinity_create_*_map() routines should set
// __kmp_affinity_masks to a vector of affinity mask objects of length
// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and return
// the number of levels in the machine topology tree (zero if
// __kmp_affinity_type == affinity_none).
//
// All of the __kmp_affinity_create_*_map() routines should set
// *__kmp_affin_fullMask to the affinity mask for the initialization thread.
// They need to save and restore the mask, and it could be needed later, so
// saving it is just an optimization to avoid calling kmp_get_system_affinity()
// again.
// Affinity mask for the initialization thread; the
// __kmp_affinity_create_*_map() routines are expected to set it.
kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
// Topology summary written by the map-creation routines in this file:
// cores per package and number of packages.
static int nCoresPerPkg, nPackages;
// Thread contexts per core (the hwloc map stores the maximum seen on any core)
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
// Core count; defined in this file only when KMP_DFLT_NTH_CORES is not defined
static int __kmp_ncores;
#endif
// OS proc index of each processing unit; allocated by the map-creation
// routines with __kmp_avail_proc entries.
static int *__kmp_pu_os_idx = NULL;
// Report whether the machine looks uniform, i.e. whether the number of
// available procs equals threads-per-core * cores-per-package * packages.
// This does not work when called from places which support arbitrarily many
// levels in the machine topology map, i.e. the non-default cases in
// __kmp_affinity_create_cpuinfo_map() and
// __kmp_affinity_create_x2apicid_map().
inline static bool __kmp_affinity_uniform_topology() {
  const int expected_procs = __kmp_nThreadsPerCore * nCoresPerPkg * nPackages;
  return __kmp_avail_proc == expected_procs;
}
// Emit the detailed machine topology map: one informational message per OS
// proc giving its physical location as a sequence of "<level name> <label>"
// pairs. pkgLevel, coreLevel and threadLevel say which depth index holds the
// package, core and thread labels; levels above the package are printed as
// numbered nodes and any remaining level as "L<index>".
static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len,
                                          int depth, int pkgLevel,
                                          int coreLevel, int threadLevel) {
  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");

  for (int proc = 0; proc < len; proc++) {
    // Each proc gets a fresh buffer that is released after its line is sent.
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);

    for (int level = 0; level < depth; level++) {
      // Level name first ...
      if (level == threadLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
      } else if (level == coreLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
      } else if (level == pkgLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
      } else if (level > pkgLevel) {
        __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                            level - pkgLevel - 1);
      } else {
        __kmp_str_buf_print(&buf, "L%d ", level);
      }
      // ... then the label this proc carries at that level.
      __kmp_str_buf_print(&buf, "%d ", address2os[proc].first.labels[level]);
    }

    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
               buf.str);
    __kmp_str_buf_free(&buf);
  }
}
#if KMP_USE_HWLOC
// Emit the hwloc-derived topology map: one informational message per entry of
// addrP, listing the package label followed by the node, tile, core and
// thread labels for whichever of those levels are present. 'levels' holds a
// positive value for each level that is still part of the label vector.
static void __kmp_affinity_print_hwloc_tp(AddrUnsPair *addrP, int len,
                                          int depth, int *levels) {
  kmp_str_buf_t buf;
  __kmp_str_buf_init(&buf);
  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");

  for (int proc = 0; proc < len; proc++) {
    // The package label is always the first one.
    __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Package),
                        addrP[proc].first.labels[0]);

    if (depth > 1) {
      int level = 1; // cursor into levels[]
      int label = 1; // cursor into the label vector

      // A node level, when NUMA was detected, comes right after the package.
      if (__kmp_numa_detected && levels[level++] > 0) {
        __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Node),
                            addrP[proc].first.labels[label++]);
      }
      // A tile level follows the node if there is one, else the package.
      if (__kmp_tile_depth > 0 && levels[level++] > 0) {
        __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Tile),
                            addrP[proc].first.labels[label++]);
      }
      // Then the core level ...
      if (levels[level++] > 0) {
        __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Core),
                            addrP[proc].first.labels[label++]);
      }
      // ... and finally the thread level.
      if (levels[level++] > 0) {
        __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Thread),
                            addrP[proc].first.labels[label++]);
      }
      KMP_DEBUG_ASSERT(label == depth);
    }

    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", addrP[proc].second, buf.str);
    // The buffer is reused for the next entry.
    __kmp_str_buf_clear(&buf);
  }

  __kmp_str_buf_free(&buf);
}
// Per-level maxima of active objects found by __kmp_affinity_create_hwloc_map():
// nodes per package, tiles per package, tiles per node, cores per node and
// cores per tile. They are reset to 0 at the start of each topology walk.
static int nNodePerPkg, nTilePerPkg, nTilePerNode, nCorePerNode, nCorePerTile;
// Remove the topology levels that are radix 1, i.e. levels at which every
// address carries the same label and which therefore add no information.
// The most common example is one thread context per core: the extra thread
// level offers no unique labels, so it is dropped. Level 0 is never removed.
//
// addrP  - array of nTh address/os-id pairs; labels and depth are updated in
//          place
// nTh    - number of entries in addrP
// depth  - number of levels currently stored in each address
// levels - per-level map; a removed level is set to -1 and the entries
//          following it are decremented to follow the shifted labels
// return value: the new depth of the addresses
static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *addrP, int nTh,
                                                  int depth, int *levels) {
  int new_depth = depth;
  for (int level = depth - 1; level > 0; --level) {
    // Detect if this level is radix 1: compare every label against entry 0.
    bool radix1_detected = true;
    for (int i = 1; i < nTh; ++i) {
      if (addrP[0].first.labels[level] != addrP[i].first.labels[level]) {
        // There are differing label values for this level so it stays
        radix1_detected = false;
        break;
      }
    }
    if (!radix1_detected)
      continue;

    // Radix 1 was detected
    --new_depth;
    levels[level] = -1; // mark level as not present in the address array

    // Close the gap by moving the labels of the deeper levels up by one. When
    // the removed level is the deepest one (level == new_depth) there is
    // nothing to move and this loop does not execute.
    for (int j = level; j < new_depth; ++j) {
      for (int i = 0; i < nTh; ++i) {
        addrP[i].first.labels[j] = addrP[i].first.labels[j + 1];
      }
      levels[j + 1] -= 1;
    }

    // Exactly one level was removed, so every address loses exactly one level
    // of depth. This must stay outside the shifting loop above; decrementing
    // once per shifted level would leave the stored depth smaller than
    // new_depth whenever more than one level had to be moved.
    for (int i = 0; i < nTh; ++i) {
      addrP[i].first.depth--;
    }
  }
  return new_depth;
}
// Count the objects of type 'type' that lie below 'obj' in the topology tree.
// For instance, with a HWLOC_OBJ_PACKAGE object and type HWLOC_OBJ_PU the
// result is the number of PUs under that package.
static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
                                           hwloc_obj_type_t type) {
  // Start from the first object of the requested type below obj ...
  hwloc_obj_t cur = hwloc_get_obj_below_by_type(
      __kmp_hwloc_topology, obj->type, obj->logical_index, type, 0);

  // ... and walk the objects of that type for as long as their ancestor of
  // obj's type is still obj itself.
  int count = 0;
  while (cur != NULL &&
         hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type,
                                        cur) == obj) {
    ++count;
    cur = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, cur->type, cur);
  }
  return count;
}
// Count the descendants of 'o' (o itself included) located at topology depth
// 'depth'. If *f is NULL on entry it receives the first such object found;
// the caller is expected to initialize it.
static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t,
                                               hwloc_obj_t o, unsigned depth,
                                               hwloc_obj_t *f) {
  if (o->depth != depth) {
    // Not at the requested depth yet: add up the matches of every child.
    // The total stays 0 when nothing matches (a PU has arity 0).
    int total = 0;
    for (unsigned child = 0; child < o->arity; child++) {
      total +=
          __kmp_hwloc_count_children_by_depth(t, o->children[child], depth, f);
    }
    return total;
  }

  if (*f == NULL) {
    *f = o; // report the first descendant found
  }
  return 1;
}
// Count the descendants of 'o' (o itself included) whose type compares equal
// to 'type'. If *f is NULL on entry it receives the first such object found;
// the caller is expected to initialize it.
static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o,
                                              hwloc_obj_type_t type,
                                              hwloc_obj_t *f) {
  if (hwloc_compare_types(o->type, type)) {
    // Different type: add up the matches of every child. The total stays 0
    // when nothing matches (a PU has arity 0).
    int total = 0;
    for (unsigned child = 0; child < o->arity; child++) {
      total +=
          __kmp_hwloc_count_children_by_type(t, o->children[child], type, f);
    }
    return total;
  }

  if (*f == NULL) {
    *f = o; // report the first descendant found
  }
  return 1;
}
// Walk every core, and every PU of each core, found under 'obj' and append an
// address for each PU that is set in __kmp_affin_fullMask.
//
// addrPair         - output array; entries are written at index nActiveThreads
// nActiveThreads   - running count of PUs recorded so far (updated)
// num_active_cores - incremented for each core with at least one recorded PU
// obj              - hwloc object whose cores are processed
// depth            - number of leading labels copied from 'labels'
// labels           - labels of the enclosing levels (package, etc.)
//
// Also fills __kmp_pu_os_idx, counts cores in __kmp_ncores and raises
// __kmp_nThreadsPerCore to the largest per-core PU count seen.
// Always returns 0.
static int __kmp_hwloc_process_obj_core_pu(AddrUnsPair *addrPair,
                                           int &nActiveThreads,
                                           int &num_active_cores,
                                           hwloc_obj_t obj, int depth,
                                           int *labels) {
  hwloc_topology_t &tp = __kmp_hwloc_topology;
  hwloc_obj_t core = NULL;
  int NC = __kmp_hwloc_count_children_by_type(tp, obj, HWLOC_OBJ_CORE, &core);

  for (int core_id = 0; core_id < NC; ++core_id, core = core->next_cousin) {
    KMP_DEBUG_ASSERT(core != NULL);
    hwloc_obj_t pu = NULL;
    int num_active_threads = 0;
    int NT = __kmp_hwloc_count_children_by_type(tp, core, HWLOC_OBJ_PU, &pu);
    // int NT = core->arity; pu = core->first_child; // faster?

    for (int pu_id = 0; pu_id < NT; ++pu_id, pu = pu->next_cousin) {
      KMP_DEBUG_ASSERT(pu != NULL);
      // Only units present in the full mask are recorded; the others are
      // inactive (inaccessible) and skipped.
      if (KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask)) {
        Address addr(depth + 2);
        KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
                      obj->os_index, obj->logical_index, core->os_index,
                      core->logical_index, pu->os_index, pu->logical_index));
        // Outer labels first (package, etc.), then the core and the PU.
        for (int i = 0; i < depth; ++i) {
          addr.labels[i] = labels[i];
        }
        addr.labels[depth] = core_id;
        addr.labels[depth + 1] = pu_id;

        addrPair[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
        __kmp_pu_os_idx[nActiveThreads] = pu->os_index;
        nActiveThreads++;
        ++num_active_threads; // active threads on this core
      }
    }

    // A core counts only if at least one of its threads is active.
    if (num_active_threads) {
      ++__kmp_ncores; // total active cores
      ++num_active_cores; // active cores under obj
      if (num_active_threads > __kmp_nThreadsPerCore) {
        __kmp_nThreadsPerCore = num_active_threads; // keep the maximum
      }
    }
  }
  return 0;
}
// Inspect the hwloc topology around the first PU to find out
//  - whether a NUMA node sits below the package level; if so
//    __kmp_numa_detected is set and a requested "node" granularity is
//    remapped to "numa";
//  - whether the unified level-2 cache above that PU covers more than one
//    core; if so its depth is stored in __kmp_tile_depth.
// Returns 1 when no PU object could be obtained, 0 otherwise.
static int __kmp_hwloc_check_numa() {
  hwloc_topology_t &tp = __kmp_hwloc_topology;
  hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
  int depth;

  // Get some PU
  hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, 0);
  if (hT == NULL) // something has gone wrong
    return 1;

  // check NUMA node below PACKAGE
  hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
  hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
  KMP_DEBUG_ASSERT(hS != NULL);
  // hS is also tested here so that a missing package ancestor cannot be
  // dereferenced in builds where the debug assertion is compiled out.
  if (hN != NULL && hS != NULL && hN->depth > hS->depth) {
    __kmp_numa_detected = TRUE; // socket includes node(s)
    if (__kmp_affinity_gran == affinity_gran_node) {
      // This has to be an assignment; a comparison here has no effect and
      // leaves the granularity at "node".
      __kmp_affinity_gran = affinity_gran_numa;
    }
  }

  // check tile, get object by depth because of multiple caches possible
  depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
  hL = hwloc_get_ancestor_obj_by_depth(tp, depth, hT);
  hC = NULL; // not used, but reset it here just in case
  if (hL != NULL &&
      __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1)
    __kmp_tile_depth = depth; // tile consists of multiple cores
  return 0;
}
// Build the machine topology map from the hwloc topology.
//
// address2os - receives the array of address/OS-proc pairs, allocated with
//              __kmp_avail_proc entries; left NULL whenever 0 is returned
// msg_id     - reset to kmp_i18n_null; no other value is stored here
//
// Returns the number of levels in the resulting map, or 0 when no map is
// produced (the process is not affinity capable, or
// __kmp_affinity_type == affinity_none). As side effects the routine sets
// __kmp_ncores, __kmp_nThreadsPerCore, nCoresPerPkg and nPackages, fills
// __kmp_pu_os_idx, and may set __kmp_affinity_gran_levels.
static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
kmp_i18n_id_t *const msg_id) {
hwloc_topology_t &tp = __kmp_hwloc_topology; // shortcut of a long name
*address2os = NULL;
*msg_id = kmp_i18n_null;
// Save the affinity mask for the current thread.
kmp_affin_mask_t *oldMask;
KMP_CPU_ALLOC(oldMask);
__kmp_get_system_affinity(oldMask, TRUE);
// Detect NUMA nodes below the package and multi-core tiles.
__kmp_hwloc_check_numa();
if (!KMP_AFFINITY_CAPABLE()) {
// Hack to try and infer the machine topology using only the data
// available from cpuid on the current thread, and __kmp_xproc.
KMP_ASSERT(__kmp_affinity_type == affinity_none);
nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(
hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0), HWLOC_OBJ_CORE);
__kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(
hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0), HWLOC_OBJ_PU);
// NOTE(review): the two counts above are used as divisors without a check
// for zero - confirm hwloc always reports at least one core and one PU here.
__kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
if (__kmp_affinity_verbose) {
KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
if (__kmp_affinity_uniform_topology()) {
KMP_INFORM(Uniform, "KMP_AFFINITY");
} else {
KMP_INFORM(NonUniform, "KMP_AFFINITY");
}
KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
__kmp_nThreadsPerCore, __kmp_ncores);
}
KMP_CPU_FREE(oldMask);
return 0;
}
// Package, core and thread are always modeled; node and tile levels are
// added when they were detected.
int depth = 3;
int levels[5] = {0, 1, 2, 3, 4}; // package, [node,] [tile,] core, thread
int labels[3] = {0}; // package [,node] [,tile] - head of labels array
if (__kmp_numa_detected)
++depth;
if (__kmp_tile_depth)
++depth;
// Allocate the data structure to be returned.
AddrUnsPair *retval =
(AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
__kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
// When affinity is off, this routine will still be called to set
// __kmp_ncores, as well as __kmp_nThreadsPerCore,
// nCoresPerPkg, & nPackages. Make sure all these vars are set
// correctly, and return if affinity is not enabled.
hwloc_obj_t socket, node, tile;
int nActiveThreads = 0;
int socket_id = 0;
// re-calculate globals to count only accessible resources
__kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
nNodePerPkg = nTilePerPkg = nTilePerNode = nCorePerNode = nCorePerTile = 0;
// Walk every package; below it, walk nodes and/or tiles when present, and
// let __kmp_hwloc_process_obj_core_pu() record the cores and PUs.
for (socket = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0); socket != NULL;
socket = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, socket),
socket_id++) {
labels[0] = socket_id;
if (__kmp_numa_detected) {
int NN;
int n_active_nodes = 0;
node = NULL;
NN = __kmp_hwloc_count_children_by_type(tp, socket, HWLOC_OBJ_NUMANODE,
&node);
for (int node_id = 0; node_id < NN; ++node_id, node = node->next_cousin) {
labels[1] = node_id;
if (__kmp_tile_depth) {
// NUMA + tiles
int NT;
int n_active_tiles = 0;
tile = NULL;
NT = __kmp_hwloc_count_children_by_depth(tp, node, __kmp_tile_depth,
&tile);
for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
labels[2] = tl_id;
int n_active_cores = 0;
__kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
n_active_cores, tile, 3, labels);
if (n_active_cores) { // were there any active cores on the tile?
++n_active_tiles; // count active tiles per node
if (n_active_cores > nCorePerTile)
nCorePerTile = n_active_cores; // calc maximum
}
}
if (n_active_tiles) { // were there any active tiles on the node?
++n_active_nodes; // count active nodes per package
if (n_active_tiles > nTilePerNode)
nTilePerNode = n_active_tiles; // calc maximum
}
} else {
// NUMA, no tiles
int n_active_cores = 0;
__kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
n_active_cores, node, 2, labels);
if (n_active_cores) { // were there any active cores on the node?
++n_active_nodes; // count active nodes per package
if (n_active_cores > nCorePerNode)
nCorePerNode = n_active_cores; // calc maximum
}
}
}
if (n_active_nodes) { // were there any active nodes on the socket?
++nPackages; // count total active packages
if (n_active_nodes > nNodePerPkg)
nNodePerPkg = n_active_nodes; // calc maximum
}
} else {
if (__kmp_tile_depth) {
// no NUMA, tiles
int NT;
int n_active_tiles = 0;
tile = NULL;
NT = __kmp_hwloc_count_children_by_depth(tp, socket, __kmp_tile_depth,
&tile);
for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
labels[1] = tl_id;
int n_active_cores = 0;
__kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
n_active_cores, tile, 2, labels);
if (n_active_cores) { // were there any active cores on the tile?
++n_active_tiles; // count active tiles per package
if (n_active_cores > nCorePerTile)
nCorePerTile = n_active_cores; // calc maximum
}
}
if (n_active_tiles) { // were there any active tiles on the socket?
++nPackages; // count total active packages
if (n_active_tiles > nTilePerPkg)
nTilePerPkg = n_active_tiles; // calc maximum
}
} else {
// no NUMA, no tiles
int n_active_cores = 0;
__kmp_hwloc_process_obj_core_pu(retval, nActiveThreads, n_active_cores,
socket, 1, labels);
if (n_active_cores) { // were there any active cores on the socket?
++nPackages; // count total active packages
if (n_active_cores > nCoresPerPkg)
nCoresPerPkg = n_active_cores; // calc maximum
}
}
}
}
// If there's only one thread context to bind to, return now.
KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
KMP_ASSERT(nActiveThreads > 0);
if (nActiveThreads == 1) {
__kmp_ncores = nPackages = 1;
__kmp_nThreadsPerCore = nCoresPerPkg = 1;
if (__kmp_affinity_verbose) {
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
if (__kmp_affinity_respect_mask) {
KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
} else {
KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
}
KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
KMP_INFORM(Uniform, "KMP_AFFINITY");
KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
__kmp_nThreadsPerCore, __kmp_ncores);
}
if (__kmp_affinity_type == affinity_none) {
__kmp_free(retval);
KMP_CPU_FREE(oldMask);
return 0;
}
// Form an Address object which only includes the package level.
Address addr(1);
addr.labels[0] = retval[0].first.labels[0];
retval[0].first = addr;
if (__kmp_affinity_gran_levels < 0) {
__kmp_affinity_gran_levels = 0;
}
if (__kmp_affinity_verbose) {
__kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
}
*address2os = retval;
KMP_CPU_FREE(oldMask);
return 1;
}
// Sort the table by physical Id.
qsort(retval, nActiveThreads, sizeof(*retval),
__kmp_affinity_cmp_Address_labels);
// Check to see if the machine topology is uniform: the product of the
// per-level maxima must equal the number of active threads.
int nPUs = nPackages * __kmp_nThreadsPerCore;
if (__kmp_numa_detected) {
if (__kmp_tile_depth) { // NUMA + tiles
nPUs *= (nNodePerPkg * nTilePerNode * nCorePerTile);
} else { // NUMA, no tiles
nPUs *= (nNodePerPkg * nCorePerNode);
}
} else {
if (__kmp_tile_depth) { // no NUMA, tiles
nPUs *= (nTilePerPkg * nCorePerTile);
} else { // no NUMA, no tiles
nPUs *= nCoresPerPkg;
}
}
unsigned uniform = (nPUs == nActiveThreads);
// Print the machine topology summary.
if (__kmp_affinity_verbose) {
char mask[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
if (__kmp_affinity_respect_mask) {
KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
} else {
KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
}
KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
if (uniform) {
KMP_INFORM(Uniform, "KMP_AFFINITY");
} else {
KMP_INFORM(NonUniform, "KMP_AFFINITY");
}
if (__kmp_numa_detected) {
if (__kmp_tile_depth) { // NUMA + tiles
KMP_INFORM(TopologyExtraNoTi, "KMP_AFFINITY", nPackages, nNodePerPkg,
nTilePerNode, nCorePerTile, __kmp_nThreadsPerCore,
__kmp_ncores);
} else { // NUMA, no tiles
KMP_INFORM(TopologyExtraNode, "KMP_AFFINITY", nPackages, nNodePerPkg,
nCorePerNode, __kmp_nThreadsPerCore, __kmp_ncores);
// NOTE(review): nPUs is scaled a second time here, after 'uniform' has
// already been computed, and is not read again in this function - this
// statement looks like a leftover; confirm before removing.
nPUs *= (nNodePerPkg * nCorePerNode);
}
} else {
if (__kmp_tile_depth) { // no NUMA, tiles
KMP_INFORM(TopologyExtraTile, "KMP_AFFINITY", nPackages, nTilePerPkg,
nCorePerTile, __kmp_nThreadsPerCore, __kmp_ncores);
} else { // no NUMA, no tiles
kmp_str_buf_t buf;
__kmp_str_buf_init(&buf);
__kmp_str_buf_print(&buf, "%d", nPackages);
KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
__kmp_nThreadsPerCore, __kmp_ncores);
__kmp_str_buf_free(&buf);
}
}
}
// With affinity off only the summary globals were needed; the table is
// released and no map is returned.
if (__kmp_affinity_type == affinity_none) {
__kmp_free(retval);
KMP_CPU_FREE(oldMask);
return 0;
}
int depth_full = depth; // number of levels before compressing
// Find any levels with radix 1, and remove them from the map
// (except for the package level).
depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth,
levels);
KMP_DEBUG_ASSERT(__kmp_affinity_gran != affinity_gran_default);
if (__kmp_affinity_gran_levels < 0) {
// Set the granularity level based on what levels are modeled
// in the machine topology map.
__kmp_affinity_gran_levels = 0; // lowest level (e.g. fine)
if (__kmp_affinity_gran > affinity_gran_thread) {
// Walk the levels from the deepest one upwards and count those that are
// still present in the compressed map.
for (int i = 1; i <= depth_full; ++i) {
if (__kmp_affinity_gran <= i) // only count deeper levels
break;
if (levels[depth_full - i] > 0)
__kmp_affinity_gran_levels++;
}
}
if (__kmp_affinity_gran > affinity_gran_package)
__kmp_affinity_gran_levels++; // e.g. granularity = group
}
if (__kmp_affinity_verbose)
__kmp_affinity_print_hwloc_tp(retval, nActiveThreads, depth, levels);
KMP_CPU_FREE(oldMask);
*address2os = retval;
return depth;
}
#endif // KMP_USE_HWLOC
// Fallback used when the machine's processor topology cannot be retrieved, or
// retrieving it failed: form a "flat" mapping of os thread ids <-> processor
// ids in which every OS proc is its own package.
//
// address2os - receives the array of address/OS-proc pairs; left NULL
//              whenever 0 is returned
// msg_id     - reset to kmp_i18n_null
//
// Returns 1 (a single level) when a map was built and 0 when none is needed.
static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
                                          kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Even with __kmp_affinity_type == affinity_none this routine may still be
  // called, in order to set __kmp_ncores, __kmp_nThreadsPerCore, nCoresPerPkg
  // and nPackages.
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    nPackages = __kmp_xproc;
    __kmp_ncores = nPackages;
    nCoresPerPkg = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // Affinity capable: the summary variables are based on the available procs.
  // They must be correct even when affinity turns out to be off below.
  nPackages = __kmp_avail_proc;
  __kmp_ncores = nPackages;
  nCoresPerPkg = 1;
  __kmp_nThreadsPerCore = nCoresPerPkg;
  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              __kmp_affin_fullMask);
    KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    KMP_INFORM(Uniform, "KMP_AFFINITY");
    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);
  }

  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);

  int proc;
  int filled = 0;

  if (__kmp_affinity_type == affinity_none) {
    // Affinity is off: only the OS index table is filled in, no map is built.
    KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
      if (KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
        __kmp_pu_os_idx[filled++] = proc; // suppose indices are flat
      }
    }
    return 0;
  }

  // Construct the data structure to be returned.
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
  KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
    // Procs that are not part of the machine model are left out.
    if (KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
      __kmp_pu_os_idx[filled] = proc; // suppose indices are flat
      Address addr(1);
      addr.labels[0] = proc;
      (*address2os)[filled++] = AddrUnsPair(addr, proc);
    }
  }

  if (__kmp_affinity_verbose) {
    KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Only the package level is modeled in the machine topology map, so the
    // number of granularity levels is either 0 or 1.
    __kmp_affinity_gran_levels =
        (__kmp_affinity_gran > affinity_gran_package) ? 1 : 0;
  }
  return 1;
}
#if KMP_GROUP_AFFINITY
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at level 1.
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
//
// address2os - receives the array of address/OS-proc pairs; left NULL when
//              -1 is returned
// msg_id     - reset to kmp_i18n_null
//
// Returns 2 (the number of levels) on success, or -1 when the process is not
// affinity capable, in which case the flat mapping will be used.
static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
                                                kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // If we aren't affinity capable, then return now.
  // The flat mapping will be used.
  if (!KMP_AFFINITY_CAPABLE()) {
    // FIXME set *msg_id
    return -1;
  }

  // Construct the data structure to be returned.
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
    // Level 0 is the processor group, level 1 the proc's index inside it.
    Address addr(2);
    addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
    addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
                 addr.labels[1]);
    }
  }

  if (__kmp_affinity_gran_levels < 0) {
    if (__kmp_affinity_gran == affinity_gran_group) {
      __kmp_affinity_gran_levels = 1;
    } else if ((__kmp_affinity_gran == affinity_gran_fine) ||
               (__kmp_affinity_gran == affinity_gran_thread)) {
      __kmp_affinity_gran_levels = 0;
    } else {
      // Core, package and node are the only other granularities expected
      // here; anything else is a logic error.
      if (__kmp_affinity_gran != affinity_gran_core &&
          __kmp_affinity_gran != affinity_gran_package &&
          __kmp_affinity_gran != affinity_gran_node) {
        KMP_ASSERT(0);
      }
      // Warning: can't use affinity granularity \"gran\" with group topology
      // method, using "thread". (No warning is actually issued yet.)
      __kmp_affinity_gran_levels = 0;
    }
  }
  return 2;
}
#endif /* KMP_GROUP_AFFINITY */
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Return the number of bits needed to encode `count` distinct values, i.e.
// the smallest r such that (1 << r) >= count. Counts of 1 or less (including
// negative values) need no bits and yield 0.
//
// The comparison is done in unsigned arithmetic so that very large counts
// never evaluate 1 << 31 on a signed int, which is undefined behaviour.
static int __kmp_cpuid_mask_width(int count) {
  if (count <= 1)
    return 0;
  const unsigned target = (unsigned)count;
  int r = 0;
  // target <= INT_MAX < 2^31, so r never exceeds 31 and the shift is valid.
  while ((1u << r) < target)
    ++r;
  return r;
}
// Per-thread-context record filled in by __kmp_affinity_create_apicid_map
// while the current thread is bound to each OS proc in turn.
class apicThreadInfo {
public:
unsigned osId; // OS proc id; the value passed to bind_thread
unsigned apicId; // bits 24:31 of ebx from cpuid(1), read after binding
unsigned maxCoresPerPkg; // bits 26:31 of eax from cpuid(4), plus 1
unsigned maxThreadsPerPkg; // bits 16:23 of ebx from cpuid(1) (0 becomes 1)
unsigned pkgId; // package field of apicId, inferred from the widths above
unsigned coreId; // core field of apicId, inferred the same way
unsigned threadId; // thread field of apicId, inferred the same way
};
static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
const void *b) {
const apicThreadInfo *aa = (const apicThreadInfo *)a;
const apicThreadInfo *bb = (const apicThreadInfo *)b;
if (aa->pkgId < bb->pkgId)
return -1;
if (aa->pkgId > bb->pkgId)
return 1;
if (aa->coreId < bb->coreId)
return -1;
if (aa->coreId > bb->coreId)
return 1;
if (aa->threadId < bb->threadId)
return -1;
if (aa->threadId > bb->threadId)
return 1;
return 0;
}
// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
//
// address2os: receives the allocated topology table, or stays NULL when no
//             table is produced.
// msg_id:     set to the i18n id describing the failure when -1 is returned.
// Returns -1 on failure, 0 when only the global counters (__kmp_ncores,
// nPackages, nCoresPerPkg, __kmp_nThreadsPerCore) were set (not affinity
// capable, or affinity type is "none"), and otherwise the depth of the
// returned table (1 when there is a single thread context).
static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
kmp_i18n_id_t *const msg_id) {
kmp_cpuid buf;
*address2os = NULL;
*msg_id = kmp_i18n_null;
// Check if cpuid leaf 4 is supported.
__kmp_x86_cpuid(0, 0, &buf);
if (buf.eax < 4) {
*msg_id = kmp_i18n_str_NoLeaf4Support;
return -1;
}
// The algorithm used starts by setting the affinity to each available thread
// and retrieving info from the cpuid instruction, so if we are not capable of
// calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then we
// need to do something else - use the defaults that we calculated from
// issuing cpuid without binding to each proc.
if (!KMP_AFFINITY_CAPABLE()) {
// Hack to try and infer the machine topology using only the data
// available from cpuid on the current thread, and __kmp_xproc.
KMP_ASSERT(__kmp_affinity_type == affinity_none);
// Get an upper bound on the number of threads per package using cpuid(1).
// On some OS/chip combinations where HT is supported by the chip but is
// disabled, this value will be 2 on a single core chip. Usually, it will be
// 2 if HT is enabled and 1 if HT is disabled.
__kmp_x86_cpuid(1, 0, &buf);
int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
if (maxThreadsPerPkg == 0) {
maxThreadsPerPkg = 1;
}
// The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
// value.
//
// The author of cpu_count.cpp treated this as only an upper bound on the
// number of cores, but I haven't seen any cases where it was greater than
// the actual number of cores, so we will treat it as exact in this block of
// code.
//
// First, we need to check if cpuid(4) is supported on this chip. To see if
// cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
// greater.
__kmp_x86_cpuid(0, 0, &buf);
if (buf.eax >= 4) {
__kmp_x86_cpuid(4, 0, &buf);
nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
} else {
nCoresPerPkg = 1;
}
// There is no way to reliably tell if HT is enabled without issuing the
// cpuid instruction from every thread, and correlating the cpuid info, so
// if the machine is not affinity capable, we assume that HT is off. We have
// seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
// does not support HT.
//
// - Older OSes are usually found on machines with older chips, which do not
// support HT.
// - The performance penalty for mistakenly identifying a machine as HT when
// it isn't (which results in blocktime being incorrectly set to 0) is
// greater than the penalty for mistakenly identifying a machine as
// being 1 thread/core when it is really HT enabled (which results in
// blocktime being incorrectly set to a positive value).
__kmp_ncores = __kmp_xproc;
nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
__kmp_nThreadsPerCore = 1;
if (__kmp_affinity_verbose) {
KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
if (__kmp_affinity_uniform_topology()) {
KMP_INFORM(Uniform, "KMP_AFFINITY");
} else {
KMP_INFORM(NonUniform, "KMP_AFFINITY");
}
KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
__kmp_nThreadsPerCore, __kmp_ncores);
}
return 0;
}
// From here on, we can assume that it is safe to call
// __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
// __kmp_affinity_type = affinity_none.
// Save the affinity mask for the current thread.
kmp_affin_mask_t *oldMask;
KMP_CPU_ALLOC(oldMask);
KMP_ASSERT(oldMask != NULL);
__kmp_get_system_affinity(oldMask, TRUE);
// Run through each of the available contexts, binding the current thread
// to it, and obtaining the pertinent information using the cpuid instr.
//
// The relevant information is:
// - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
// has a unique Apic Id, which is of the form pkg# : core# : thread#.
// - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
// of this field determines the width of the core# + thread# fields in the
// Apic Id. It is also an upper bound on the number of threads per
// package, but it has been verified that situations happen where it is not
// exact. In particular, on certain OS/chip combinations where Intel(R)
// Hyper-Threading Technology is supported by the chip but has been
// disabled, the value of this field will be 2 (for a single core chip).
// On other OS/chip combinations supporting Intel(R) Hyper-Threading
// Technology, the value of this field will be 1 when Intel(R)
// Hyper-Threading Technology is disabled and 2 when it is enabled.
// - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
// of this field (+1) determines the width of the core# field in the Apic
// Id. The comments in "cpucount.cpp" say that this value is an upper
// bound, but the IA-32 architecture manual says that it is exactly the
// number of cores per package, and I haven't seen any case where it
// wasn't.
//
// From this information, deduce the package Id, core Id, and thread Id,
// and set the corresponding fields in the apicThreadInfo struct.
unsigned i;
apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
__kmp_avail_proc * sizeof(apicThreadInfo));
unsigned nApics = 0;
KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
// Skip this proc if it is not included in the machine model.
if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
continue;
}
KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
__kmp_affinity_dispatch->bind_thread(i);
threadInfo[nApics].osId = i;
// The apic id and max threads per pkg come from cpuid(1).
__kmp_x86_cpuid(1, 0, &buf);
// Bit 9 of edx must be set, otherwise report ApicNotPresent. Undo the
// binding and release everything before returning.
if (((buf.edx >> 9) & 1) == 0) {
__kmp_set_system_affinity(oldMask, TRUE);
__kmp_free(threadInfo);
KMP_CPU_FREE(oldMask);
*msg_id = kmp_i18n_str_ApicNotPresent;
return -1;
}
threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
if (threadInfo[nApics].maxThreadsPerPkg == 0) {
threadInfo[nApics].maxThreadsPerPkg = 1;
}
// Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
// value.
//
// First, we need to check if cpuid(4) is supported on this chip. To see if
// cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
// or greater.
__kmp_x86_cpuid(0, 0, &buf);
if (buf.eax >= 4) {
__kmp_x86_cpuid(4, 0, &buf);
threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
} else {
threadInfo[nApics].maxCoresPerPkg = 1;
}
// Infer the pkgId / coreId / threadId using only the info obtained locally.
// widthCT is the combined width of the core# and thread# fields, widthC the
// width of the core# field alone; the thread# field gets the remainder.
int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
int widthT = widthCT - widthC;
if (widthT < 0) {
// I've never seen this one happen, but I suppose it could, if the cpuid
// instruction on a chip was really screwed up. Make sure to restore the
// affinity mask before the tail call.
__kmp_set_system_affinity(oldMask, TRUE);
__kmp_free(threadInfo);
KMP_CPU_FREE(oldMask);
*msg_id = kmp_i18n_str_InvalidCpuidInfo;
return -1;
}
int maskC = (1 << widthC) - 1;
threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;
int maskT = (1 << widthT) - 1;
threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;
nApics++;
}
// We've collected all the info we need.
// Restore the old affinity mask for this thread.
__kmp_set_system_affinity(oldMask, TRUE);
// If there's only one thread context to bind to, form an Address object
// with depth 1 and return immediately (or, if affinity is off, set
// address2os to NULL and return).
//
// If it is configured to omit the package level when there is only a single
// package, the logic at the end of this routine won't work if there is only
// a single thread - it would try to form an Address object with depth 0.
KMP_ASSERT(nApics > 0);
if (nApics == 1) {
__kmp_ncores = nPackages = 1;
__kmp_nThreadsPerCore = nCoresPerPkg = 1;
if (__kmp_affinity_verbose) {
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
if (__kmp_affinity_respect_mask) {
KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
} else {
KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
}
KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
KMP_INFORM(Uniform, "KMP_AFFINITY");
KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
__kmp_nThreadsPerCore, __kmp_ncores);
}
if (__kmp_affinity_type == affinity_none) {
__kmp_free(threadInfo);
KMP_CPU_FREE(oldMask);
return 0;
}
*address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
Address addr(1);
addr.labels[0] = threadInfo[0].pkgId;
(*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
if (__kmp_affinity_gran_levels < 0) {
__kmp_affinity_gran_levels = 0;
}
if (__kmp_affinity_verbose) {
__kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
}
__kmp_free(threadInfo);
KMP_CPU_FREE(oldMask);
return 1;
}
// Sort the threadInfo table by physical Id.
qsort(threadInfo, nApics, sizeof(*threadInfo),
__kmp_affinity_cmp_apicThreadInfo_phys_id);
// The table is now sorted by pkgId / coreId / threadId, but we really don't
// know the radix of any of the fields. pkgId's may be sparsely assigned among
// the chips on a system. Although coreId's are usually assigned
// [0 .. coresPerPkg-1] and threadId's are usually assigned
// [0..threadsPerCore-1], we don't want to make any such assumptions.
//
// For that matter, we don't know what coresPerPkg and threadsPerCore (or the
// total # packages) are at this point - we want to determine that now. We
// only have an upper bound on the first two figures.
//
// We also perform a consistency check at this point: the values returned by
// the cpuid instruction for any thread bound to a given package had better
// return the same info for maxThreadsPerPkg and maxCoresPerPkg.
nPackages = 1;
nCoresPerPkg = 1;
__kmp_nThreadsPerCore = 1;
unsigned nCores = 1;
unsigned pkgCt = 1; // to determine radii
unsigned lastPkgId = threadInfo[0].pkgId;
unsigned coreCt = 1;
unsigned lastCoreId = threadInfo[0].coreId;
unsigned threadCt = 1;
unsigned lastThreadId = threadInfo[0].threadId;
// intra-pkg consistency checks
unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
for (i = 1; i < nApics; i++) {
if (threadInfo[i].pkgId != lastPkgId) {
// New package: it starts a new core too, so fold the running core and
// thread counts into the maxima and restart them.
nCores++;
pkgCt++;
lastPkgId = threadInfo[i].pkgId;
if ((int)coreCt > nCoresPerPkg)
nCoresPerPkg = coreCt;
coreCt = 1;
lastCoreId = threadInfo[i].coreId;
if ((int)threadCt > __kmp_nThreadsPerCore)
__kmp_nThreadsPerCore = threadCt;
threadCt = 1;
lastThreadId = threadInfo[i].threadId;
// This is a different package, so go on to the next iteration without
// doing any consistency checks. Reset the consistency check vars, though.
prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
continue;
}
if (threadInfo[i].coreId != lastCoreId) {
nCores++;
coreCt++;
lastCoreId = threadInfo[i].coreId;
if ((int)threadCt > __kmp_nThreadsPerCore)
__kmp_nThreadsPerCore = threadCt;
threadCt = 1;
lastThreadId = threadInfo[i].threadId;
} else if (threadInfo[i].threadId != lastThreadId) {
threadCt++;
lastThreadId = threadInfo[i].threadId;
} else {
// Same pkgId, coreId and threadId as the previous record: the ids are
// not unique.
__kmp_free(threadInfo);
KMP_CPU_FREE(oldMask);
*msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
return -1;
}
// Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
// fields agree between all the threads bound to a given package.
if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
(prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
__kmp_free(threadInfo);
KMP_CPU_FREE(oldMask);
*msg_id = kmp_i18n_str_InconsistentCpuidInfo;
return -1;
}
}
// Fold in the counts of the last package / core, which the loop never
// closed.
nPackages = pkgCt;
if ((int)coreCt > nCoresPerPkg)
nCoresPerPkg = coreCt;
if ((int)threadCt > __kmp_nThreadsPerCore)
__kmp_nThreadsPerCore = threadCt;
// When affinity is off, this routine will still be called to set
// __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
// Make sure all these vars are set correctly, and return now if affinity is
// not enabled.
__kmp_ncores = nCores;
if (__kmp_affinity_verbose) {
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
if (__kmp_affinity_respect_mask) {
KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
} else {
KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
}
KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
if (__kmp_affinity_uniform_topology()) {
KMP_INFORM(Uniform, "KMP_AFFINITY");
} else {
KMP_INFORM(NonUniform, "KMP_AFFINITY");
}
KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
__kmp_nThreadsPerCore, __kmp_ncores);
}
// Record the OS proc id of every entry, in sorted (physical id) order.
KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);
__kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
for (i = 0; i < nApics; ++i) {
__kmp_pu_os_idx[i] = threadInfo[i].osId;
}
if (__kmp_affinity_type == affinity_none) {
__kmp_free(threadInfo);
KMP_CPU_FREE(oldMask);
return 0;
}
// Now that we've determined the number of packages, the number of cores per
// package, and the number of threads per core, we can construct the data
// structure that is to be returned. A level whose radix is 1 (cores or
// threads) is left out of the labels; the package level is always kept.
int pkgLevel = 0;
int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
int threadLevel =
(__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
KMP_ASSERT(depth > 0);
*address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
for (i = 0; i < nApics; ++i) {
Address addr(depth);
unsigned os = threadInfo[i].osId;
int d = 0;
if (pkgLevel >= 0) {
addr.labels[d++] = threadInfo[i].pkgId;
}
if (coreLevel >= 0) {
addr.labels[d++] = threadInfo[i].coreId;
}
if (threadLevel >= 0) {
addr.labels[d++] = threadInfo[i].threadId;
}
(*address2os)[i] = AddrUnsPair(addr, os);
}
if (__kmp_affinity_gran_levels < 0) {
// Set the granularity level based on what levels are modeled in the machine
// topology map.
__kmp_affinity_gran_levels = 0;
if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
__kmp_affinity_gran_levels++;
}
if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
__kmp_affinity_gran_levels++;
}
if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
__kmp_affinity_gran_levels++;
}
}
if (__kmp_affinity_verbose) {
__kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
coreLevel, threadLevel);
}
__kmp_free(threadInfo);
KMP_CPU_FREE(oldMask);
return depth;
}
// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on cpuid leaf 11.
//
// address2os: receives the allocated topology table, or stays NULL when no
//             table is produced.
// msg_id:     set to the i18n id describing the failure when -1 is returned.
// Returns -1 on failure, 0 when only the global counters (__kmp_ncores,
// nPackages, nCoresPerPkg, __kmp_nThreadsPerCore) were set (not affinity
// capable, or affinity type is "none"), and otherwise the depth of the
// returned table (1 when there is a single thread context).
static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
                                              kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Check to see if cpuid leaf 11 is supported.
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 11) {
    *msg_id = kmp_i18n_str_NoLeaf11Support;
    return -1;
  }
  __kmp_x86_cpuid(11, 0, &buf);
  if (buf.ebx == 0) {
    *msg_id = kmp_i18n_str_NoLeaf11Support;
    return -1;
  }

  // Find the number of levels in the machine topology. While we're at it, get
  // the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will try to
  // get more accurate values later by explicitly counting them, but get
  // reasonable defaults now, in case we return early.
  int level;
  int threadLevel = -1;
  int coreLevel = -1;
  int pkgLevel = -1;
  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;

  for (level = 0;; level++) {
    if (level > 31) {
      // FIXME: Hack for DPD200163180
      //
      // If level is big then something went wrong -> exiting
      //
      // There could actually be 32 valid levels in the machine topology, but so
      // far, the only machine we have seen which does not exit this loop before
      // iteration 32 has fubar x2APIC settings.
      //
      // For now, just reject this case based upon loop trip count.
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
      return -1;
    }
    __kmp_x86_cpuid(11, level, &buf);
    if (buf.ebx == 0) {
      if (pkgLevel < 0) {
        // Will infer nPackages from __kmp_xproc
        pkgLevel = level;
        level++;
      }
      break;
    }
    int kind = (buf.ecx >> 8) & 0xff;
    if (kind == 1) {
      // SMT level
      threadLevel = level;
      coreLevel = -1;
      pkgLevel = -1;
      __kmp_nThreadsPerCore = buf.ebx & 0xffff;
      if (__kmp_nThreadsPerCore == 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
    } else if (kind == 2) {
      // core level
      coreLevel = level;
      pkgLevel = -1;
      nCoresPerPkg = buf.ebx & 0xffff;
      if (nCoresPerPkg == 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
    } else {
      // Any other kind is treated as the package level; only the first such
      // level is recorded, later ones are just counted.
      if (level <= 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
      if (pkgLevel >= 0) {
        continue;
      }
      pkgLevel = level;
      nPackages = buf.ebx & 0xffff;
      if (nPackages == 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
    }
  }
  int depth = level;

  // In the above loop, "level" was counted from the finest level (usually
  // thread) to the coarsest. The caller expects that we will place the labels
  // in (*address2os)[].first.labels[] in the inverse order, so we need to
  // invert the vars saying which level means what.
  if (threadLevel >= 0) {
    threadLevel = depth - threadLevel - 1;
  }
  if (coreLevel >= 0) {
    coreLevel = depth - coreLevel - 1;
  }
  KMP_DEBUG_ASSERT(pkgLevel >= 0);
  pkgLevel = depth - pkgLevel - 1;

  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable of
  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
  // we need to do something else - use the defaults that we calculated from
  // issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  __kmp_get_system_affinity(oldMask, TRUE);

  // Allocate the data structure to be returned.
  AddrUnsPair *retval =
      (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  unsigned int proc;
  int nApics = 0;
  KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
      continue;
    }
    KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);

    __kmp_affinity_dispatch->bind_thread(proc);

    // Extract labels for each level in the machine topology map from Apic ID.
    Address addr(depth);
    int prev_shift = 0;

    for (level = 0; level < depth; level++) {
      __kmp_x86_cpuid(11, level, &buf);
      unsigned apicId = buf.edx;
      if (buf.ebx == 0) {
        if (level != depth - 1) {
          // This proc reports fewer levels than the one probed at the start.
          // Undo the binding and release the partially filled table before
          // bailing out, so the caller's affinity is not left changed and
          // the table is not leaked.
          __kmp_set_system_affinity(oldMask, TRUE);
          __kmp_free(retval);
          KMP_CPU_FREE(oldMask);
          *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
          return -1;
        }
        // The outermost level takes all remaining high bits of the id.
        addr.labels[depth - level - 1] = apicId >> prev_shift;
        level++;
        break;
      }
      int shift = buf.eax & 0x1f;
      // Unsigned so that a shift of 31 does not overflow a signed int.
      unsigned mask = (1u << shift) - 1;
      addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
      prev_shift = shift;
    }
    if (level != depth) {
      // Same cleanup as above: restore the affinity mask and free the table.
      __kmp_set_system_affinity(oldMask, TRUE);
      __kmp_free(retval);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
      return -1;
    }

    retval[nApics] = AddrUnsPair(addr, proc);
    nApics++;
  }

  // We've collected all the info we need.
  // Restore the old affinity mask for this thread.
  __kmp_set_system_affinity(oldMask, TRUE);

  // If there's only one thread context to bind to, return now.
  KMP_ASSERT(nApics > 0);
  if (nApics == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

      KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
      if (__kmp_affinity_respect_mask) {
        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
      } else {
        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
      }
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(retval);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    // Form an Address object which only includes the package level.
    Address addr(1);
    addr.labels[0] = retval[0].first.labels[pkgLevel];
    retval[0].first = addr;

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
    }

    *address2os = retval;
    KMP_CPU_FREE(oldMask);
    return 1;
  }

  // Sort the table by physical Id.
  qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);

  // Find the radix at each of the levels.
  //   totals[l] - number of distinct units seen at level l
  //   counts[l] - running count of units at level l in the current parent
  //   maxCt[l]  - largest value counts[l] has reached
  //   last[l]   - label at level l of the previous table entry
  unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  for (level = 0; level < depth; level++) {
    totals[level] = 1;
    maxCt[level] = 1;
    counts[level] = 1;
    last[level] = retval[0].first.labels[level];
  }

  // From here on, "level" indexes (*address2os)[].first.labels[] directly and
  // we iterate forward through it. The labels were stored in inverted order
  // above (labels[depth - level - 1]), so this is the opposite direction to
  // the cpuid level numbering used in the previous loops.
  for (proc = 1; (int)proc < nApics; proc++) {
    int level;
    for (level = 0; level < depth; level++) {
      if (retval[proc].first.labels[level] != last[level]) {
        int j;
        for (j = level + 1; j < depth; j++) {
          totals[j]++;
          counts[j] = 1;
          // maxCt[j] is deliberately NOT reset to 1 here. Resetting it prints
          // incorrect topology information when the max value for some level
          // is encountered earlier than some smaller value while going
          // through the array. For example, let pkg0 have 4 cores and pkg1
          // have 2 cores. Then maxCt[1] would end up 2 whereas it must be 4.
          // TODO!!! Check if the reset can be dropped safely.
          last[j] = retval[proc].first.labels[j];
        }
        totals[level]++;
        counts[level]++;
        if (counts[level] > maxCt[level]) {
          maxCt[level] = counts[level];
        }
        last[level] = retval[proc].first.labels[level];
        break;
      } else if (level == depth - 1) {
        // Every label matches the previous entry: the ids are not unique.
        __kmp_free(last);
        __kmp_free(maxCt);
        __kmp_free(counts);
        __kmp_free(totals);
        __kmp_free(retval);
        KMP_CPU_FREE(oldMask);
        *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
        return -1;
      }
    }
  }

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return if affinity is not
  // enabled.
  if (threadLevel >= 0) {
    __kmp_nThreadsPerCore = maxCt[threadLevel];
  } else {
    __kmp_nThreadsPerCore = 1;
  }
  nPackages = totals[pkgLevel];

  if (coreLevel >= 0) {
    __kmp_ncores = totals[coreLevel];
    nCoresPerPkg = maxCt[coreLevel];
  } else {
    __kmp_ncores = nPackages;
    nCoresPerPkg = 1;
  }

  // Check to see if the machine topology is uniform: the product of the
  // per-level maxima must equal the total count at the last level.
  unsigned prod = maxCt[0];
  for (level = 1; level < depth; level++) {
    prod *= maxCt[level];
  }
  bool uniform = (prod == totals[level - 1]);

  // Print the machine topology summary.
  if (__kmp_affinity_verbose) {
    char mask[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

    KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (uniform) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }

    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);

    __kmp_str_buf_print(&buf, "%d", totals[0]);
    for (level = 1; level <= pkgLevel; level++) {
      __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
    }
    KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);

    __kmp_str_buf_free(&buf);
  }

  // Record the OS proc id of every entry, in sorted (physical id) order.
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  for (proc = 0; (int)proc < nApics; ++proc) {
    __kmp_pu_os_idx[proc] = retval[proc].second;
  }

  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(last);
    __kmp_free(maxCt);
    __kmp_free(counts);
    __kmp_free(totals);
    __kmp_free(retval);
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  // Find any levels with radix 1, and remove them from the map
  // (except for the package level).
  int new_depth = 0;
  for (level = 0; level < depth; level++) {
    if ((maxCt[level] == 1) && (level != pkgLevel)) {
      continue;
    }
    new_depth++;
  }

  // If we are removing any levels, allocate a new vector to return,
  // and copy the relevant information to it.
  if (new_depth != depth) {
    AddrUnsPair *new_retval =
        (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
    for (proc = 0; (int)proc < nApics; proc++) {
      Address addr(new_depth);
      new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
    }
    int new_level = 0;
    int newPkgLevel = -1;
    int newCoreLevel = -1;
    int newThreadLevel = -1;
    for (level = 0; level < depth; level++) {
      if ((maxCt[level] == 1) && (level != pkgLevel)) {
        // Remove this level. Never remove the package level
        continue;
      }
      if (level == pkgLevel) {
        newPkgLevel = new_level;
      }
      if (level == coreLevel) {
        newCoreLevel = new_level;
      }
      if (level == threadLevel) {
        newThreadLevel = new_level;
      }
      for (proc = 0; (int)proc < nApics; proc++) {
        new_retval[proc].first.labels[new_level] =
            retval[proc].first.labels[level];
      }
      new_level++;
    }

    __kmp_free(retval);
    retval = new_retval;
    depth = new_depth;
    pkgLevel = newPkgLevel;
    coreLevel = newCoreLevel;
    threadLevel = newThreadLevel;
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Set the granularity level based on what levels are modeled
    // in the machine topology map.
    __kmp_affinity_gran_levels = 0;
    if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
      __kmp_affinity_gran_levels++;
    }
    if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
      __kmp_affinity_gran_levels++;
    }
    if (__kmp_affinity_gran > affinity_gran_package) {
      __kmp_affinity_gran_levels++;
    }
  }

  if (__kmp_affinity_verbose) {
    __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, coreLevel,
                                  threadLevel);
  }

  __kmp_free(last);
  __kmp_free(maxCt);
  __kmp_free(counts);
  __kmp_free(totals);
  KMP_CPU_FREE(oldMask);
  *address2os = retval;
  return depth;
}
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
// Field positions inside a ProcCpuInfo record (an array of unsigned).
// The comparator below compares records from index maxIndex down to osIdIndex.
#define osIdIndex 0
#define threadIdIndex 1
#define coreIdIndex 2
#define pkgIdIndex 3
// First node-level field; the cpuinfo parser uses nodeIdIndex + <n> for a
// "node_<n> id" entry.
#define nodeIdIndex 4
typedef unsigned *ProcCpuInfo;
// Highest field index currently in use. Starts at the package field and is
// raised by the cpuinfo parser when node_<n> entries are found.
static unsigned maxIndex = pkgIdIndex;
// qsort comparator for an array of ProcCpuInfo records (each element is a
// pointer to an array of unsigned). Fields are compared from index maxIndex
// down to osIdIndex; the first differing field decides. Returns -1, 0 or 1.
static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
                                                  const void *b) {
  const unsigned *lhs = *(unsigned *const *)a;
  const unsigned *rhs = *(unsigned *const *)b;
  unsigned field = maxIndex;
  for (;;) {
    if (lhs[field] != rhs[field])
      return (lhs[field] < rhs[field]) ? -1 : 1;
    // Stop after the lowest field; decrementing further would wrap around.
    if (field == osIdIndex)
      return 0;
    --field;
  }
}
#if KMP_USE_HIER_SCHED
// Fill __kmp_hier_max_units (how many units exist per layer) and
// __kmp_hier_threads_per (how many hardware threads one unit of a layer
// holds) from the detected topology counters.
static void __kmp_dispatch_set_hierarchy_values() {
  // L2 defaults to one per core. On Intel(R) Xeon Phi(TM) (mic3 and later) it
  // is one per pair of cores instead.
  int l2Units = __kmp_ncores;
  int l2Threads = __kmp_nThreadsPerCore;
#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
  if (__kmp_mic_type >= mic3) {
    l2Units = __kmp_ncores / 2;
    l2Threads = 2 * __kmp_nThreadsPerCore;
  }
#endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)

  // Hardware threads: one unit per thread.
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
      nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;

  // L1: one per core.
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
      __kmp_nThreadsPerCore;

  // L2: see above.
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = l2Units;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = l2Threads;

  // L3 and NUMA: one per package.
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] =
      nCoresPerPkg * __kmp_nThreadsPerCore;
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] =
      nCoresPerPkg * __kmp_nThreadsPerCore;

  // LOOP: a single unit spanning every hardware thread.
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] =
      nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
}
// Map a thread id onto the unit it belongs to at the requested layer, i.e.
// "which L1 is this thread's L1", "which L2 is this thread's L2", and so on.
// LAYER_THREAD maps a tid to itself and LAYER_LOOP always maps to unit 0.
int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) {
  KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST);
  switch (type) {
  case kmp_hier_layer_e::LAYER_THREAD:
    return tid;
  case kmp_hier_layer_e::LAYER_LOOP:
    return 0;
  default:
    break;
  }

  // The tables are indexed by layer + 1.
  const int slot = type + 1;
  KMP_DEBUG_ASSERT(__kmp_hier_max_units[slot] != 0);

  // Thread ids at or beyond the number of hardware threads wrap around.
  const int hwThreads =
      __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
  const int wrappedTid = (tid >= hwThreads) ? (tid % hwThreads) : tid;

  return (wrappedTid / __kmp_hier_threads_per[slot]) %
         __kmp_hier_max_units[slot];
}
// Return how many units of layer t1 fit into one unit of layer t2. t1 must not
// be a higher layer than t2.
int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) {
  // The per-layer tables are indexed by layer + 1.
  const int innerSlot = t1 + 1;
  const int outerSlot = t2 + 1;
  KMP_DEBUG_ASSERT(innerSlot <= outerSlot);
  KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST);
  KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST);
  KMP_DEBUG_ASSERT(__kmp_hier_threads_per[innerSlot] != 0);

  // (threads per t2) / (threads per t1) gives the number of t1's in one t2.
  const int threadsPerOuter = __kmp_hier_threads_per[outerSlot];
  const int threadsPerInner = __kmp_hier_threads_per[innerSlot];
  return threadsPerOuter / threadsPerInner;
}
#endif // KMP_USE_HIER_SCHED
// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
// affinity map.
//
// The file is scanned twice. The first pass only counts "processor" records
// and widens maxIndex for any "node_<n> id" field it sees. After rewinding,
// the second pass fills one threadInfo[] row per record (a blank line ends a
// record) and validates it.
//
// Parameters:
//   address2os - out: set to NULL on entry; on the paths that build a map it
//                receives an array obtained from __kmp_allocate().
//   line       - out: incremented for every line consumed by the second pass;
//                written as 0 on the first-pass error paths and again once
//                parsing has finished.
//   msg_id     - out: kmp_i18n_null on entry; set to the id of the failure
//                message before every "return -1" in the parsing code.
//   f          - stream to read; rewound with fseek() between the two passes.
//
// Returns -1 on error, 0 when __kmp_affinity_type == affinity_none (the
// topology globals are still filled in), 1 when exactly one thread context is
// available, and otherwise the depth of the Address objects in *address2os.
static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os,
int *line,
kmp_i18n_id_t *const msg_id,
FILE *f) {
*address2os = NULL;
*msg_id = kmp_i18n_null;
// Scan the file, and count the number of "processor" (osId) fields,
// and find the highest value of <n> for a node_<n> field.
char buf[256];
unsigned num_records = 0;
while (!feof(f)) {
// Sentinel in the last byte of the buffer; the same sentinel is tested in
// the second pass to detect lines longer than the buffer.
buf[sizeof(buf) - 1] = 1;
if (!fgets(buf, sizeof(buf), f)) {
// Read errors presumably because of EOF
break;
}
char s1[] = "processor";
if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
num_records++;
continue;
}
// FIXME - this will match "node_<n> <garbage>"
unsigned level;
if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
if (nodeIdIndex + level >= maxIndex) {
maxIndex = nodeIdIndex + level;
}
continue;
}
}
// Check for empty file / no valid processor records, or too many. The number
// of records can't exceed the number of valid bits in the affinity mask.
if (num_records == 0) {
*line = 0;
*msg_id = kmp_i18n_str_NoProcRecords;
return -1;
}
if (num_records > (unsigned)__kmp_xproc) {
*line = 0;
*msg_id = kmp_i18n_str_TooManyProcRecords;
return -1;
}
// Set the file pointer back to the beginning, so that we can scan the file
// again, this time performing a full parse of the data. Allocate a vector of
// ProcCpuInfo object, where we will place the data. Adding an extra element
// at the end allows us to remove a lot of extra checks for termination
// conditions.
if (fseek(f, 0, SEEK_SET) != 0) {
*line = 0;
*msg_id = kmp_i18n_str_CantRewindCpuinfo;
return -1;
}
// Allocate the array of records to store the proc info in. The dummy
// element at the end makes the logic in filling them out easier to code.
// Each row holds maxIndex + 1 unsigned fields, addressed by the *IdIndex
// constants.
unsigned **threadInfo =
(unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *));
unsigned i;
for (i = 0; i <= num_records; i++) {
threadInfo[i] =
(unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
}
// Frees every row of threadInfo and then the row table itself. Uses (and
// clobbers) the local variable i.
#define CLEANUP_THREAD_INFO \
for (i = 0; i <= num_records; i++) { \
__kmp_free(threadInfo[i]); \
} \
__kmp_free(threadInfo);
// A value of UINT_MAX means that we didn't find the field
unsigned __index;
// Marks every field of row p as "not found". Uses the local __index.
#define INIT_PROC_INFO(p) \
for (__index = 0; __index <= maxIndex; __index++) { \
(p)[__index] = UINT_MAX; \
}
for (i = 0; i <= num_records; i++) {
INIT_PROC_INFO(threadInfo[i]);
}
// num_avail is the number of records accepted so far; threadInfo[num_avail]
// is the record currently being filled in.
unsigned num_avail = 0;
*line = 0;
while (!feof(f)) {
// Create an inner scoping level, so that all the goto targets at the end of
// the loop appear in an outer scoping level. This avoids warnings about
// jumping past an initialization to a target in the same block.
{
buf[sizeof(buf) - 1] = 1;
bool long_line = false;
if (!fgets(buf, sizeof(buf), f)) {
// Read errors presumably because of EOF
// If there is valid data in threadInfo[num_avail], then fake
// a blank line to ensure that the last address gets parsed.
bool valid = false;
for (i = 0; i <= maxIndex; i++) {
if (threadInfo[num_avail][i] != UINT_MAX) {
valid = true;
}
}
if (!valid) {
break;
}
buf[0] = 0;
} else if (!buf[sizeof(buf) - 1]) {
// The line is longer than the buffer. Set a flag and don't
// emit an error if we were going to ignore the line, anyway.
long_line = true;
#define CHECK_LINE \
if (long_line) { \
CLEANUP_THREAD_INFO; \
*msg_id = kmp_i18n_str_LongLineCpuinfo; \
return -1; \
}
}
(*line)++;
// "processor : <osId>"
char s1[] = "processor";
if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
CHECK_LINE;
char *p = strchr(buf + sizeof(s1) - 1, ':');
unsigned val;
if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
goto no_val;
if (threadInfo[num_avail][osIdIndex] != UINT_MAX)
#if KMP_ARCH_AARCH64
// Handle the old AArch64 /proc/cpuinfo layout differently,
// it contains all of the 'processor' entries listed in a
// single 'Processor' section, therefore the normal looking
// for duplicates in that section will always fail.
num_avail++;
#else
goto dup_field;
#endif
threadInfo[num_avail][osIdIndex] = val;
#if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
// On these targets the package and core ids are read from sysfs for the
// OS proc just parsed; the "physical id" / "core id" parsing below is
// compiled out.
char path[256];
KMP_SNPRINTF(
path, sizeof(path),
"/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
threadInfo[num_avail][osIdIndex]);
__kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
KMP_SNPRINTF(path, sizeof(path),
"/sys/devices/system/cpu/cpu%u/topology/core_id",
threadInfo[num_avail][osIdIndex]);
__kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
continue;
#else
}
// "physical id : <pkgId>"
char s2[] = "physical id";
if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
CHECK_LINE;
char *p = strchr(buf + sizeof(s2) - 1, ':');
unsigned val;
if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
goto no_val;
if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX)
goto dup_field;
threadInfo[num_avail][pkgIdIndex] = val;
continue;
}
// "core id : <coreId>"
char s3[] = "core id";
if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
CHECK_LINE;
char *p = strchr(buf + sizeof(s3) - 1, ':');
unsigned val;
if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
goto no_val;
if (threadInfo[num_avail][coreIdIndex] != UINT_MAX)
goto dup_field;
threadInfo[num_avail][coreIdIndex] = val;
continue;
#endif // KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
}
// "thread id : <threadId>"
char s4[] = "thread id";
if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
CHECK_LINE;
char *p = strchr(buf + sizeof(s4) - 1, ':');
unsigned val;
if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
goto no_val;
if (threadInfo[num_avail][threadIdIndex] != UINT_MAX)
goto dup_field;
threadInfo[num_avail][threadIdIndex] = val;
continue;
}
// "node_<level> id : <nodeId>"
unsigned level;
if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
CHECK_LINE;
// NOTE(review): the search offset uses sizeof(s4) ("thread id") rather than
// the length of the matched "node_<n> id" prefix - looks like a copy/paste
// from the block above; confirm it is intended.
char *p = strchr(buf + sizeof(s4) - 1, ':');
unsigned val;
if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
goto no_val;
KMP_ASSERT(nodeIdIndex + level <= maxIndex);
if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX)
goto dup_field;
threadInfo[num_avail][nodeIdIndex + level] = val;
continue;
}
// We didn't recognize the leading token on the line. There are lots of
// leading tokens that we don't recognize - if the line isn't empty, go on
// to the next line.
if ((*buf != 0) && (*buf != '\n')) {
// If the line is longer than the buffer, read characters
// until we find a newline.
if (long_line) {
int ch;
while (((ch = fgetc(f)) != EOF) && (ch != '\n'))
;
}
continue;
}
// A newline has signalled the end of the processor record.
// Check that there aren't too many procs specified.
if ((int)num_avail == __kmp_xproc) {
CLEANUP_THREAD_INFO;
*msg_id = kmp_i18n_str_TooManyEntries;
return -1;
}
// Check for missing fields. The osId field must be there, and we
// currently require that the physical id field is specified, also.
if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
CLEANUP_THREAD_INFO;
*msg_id = kmp_i18n_str_MissingProcField;
return -1;
}
// NOTE(review): this tests record 0, not threadInfo[num_avail], so once the
// first record has been accepted later records are not checked for a
// missing physical id - confirm whether that is intended.
if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
CLEANUP_THREAD_INFO;
*msg_id = kmp_i18n_str_MissingPhysicalIDField;
return -1;
}
// Skip this proc if it is not included in the machine model.
if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex],
__kmp_affin_fullMask)) {
INIT_PROC_INFO(threadInfo[num_avail]);
continue;
}
// We have a successful parse of this proc's info.
// Increment the counter, and prepare for the next proc.
num_avail++;
KMP_ASSERT(num_avail <= num_records);
INIT_PROC_INFO(threadInfo[num_avail]);
}
continue;
// Error exits shared by all of the field parsers above.
no_val:
CLEANUP_THREAD_INFO;
*msg_id = kmp_i18n_str_MissingValCpuinfo;
return -1;
dup_field:
CLEANUP_THREAD_INFO;
*msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
return -1;
}
*line = 0;
#if KMP_MIC && REDUCE_TEAM_SIZE
unsigned teamSize = 0;
#endif // KMP_MIC && REDUCE_TEAM_SIZE
// check for num_records == __kmp_xproc ???
// If there's only one thread context to bind to, form an Address object with
// depth 1 and return immediately (or, if affinity is off, set address2os to
// NULL and return).
//
// If it is configured to omit the package level when there is only a single
// package, the logic at the end of this routine won't work if there is only a
// single thread - it would try to form an Address object with depth 0.
KMP_ASSERT(num_avail > 0);
KMP_ASSERT(num_avail <= num_records);
if (num_avail == 1) {
__kmp_ncores = 1;
__kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
if (__kmp_affinity_verbose) {
if (!KMP_AFFINITY_CAPABLE()) {
KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
KMP_INFORM(Uniform, "KMP_AFFINITY");
} else {
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
__kmp_affin_fullMask);
KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
if (__kmp_affinity_respect_mask) {
KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
} else {
KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
}
KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
KMP_INFORM(Uniform, "KMP_AFFINITY");
}
// Build the "1 x 1 x ..." string for the topology message.
int index;
kmp_str_buf_t buf;
__kmp_str_buf_init(&buf);
__kmp_str_buf_print(&buf, "1");
for (index = maxIndex - 1; index > pkgIdIndex; index--) {
__kmp_str_buf_print(&buf, " x 1");
}
KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
__kmp_str_buf_free(&buf);
}
if (__kmp_affinity_type == affinity_none) {
CLEANUP_THREAD_INFO;
return 0;
}
*address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
Address addr(1);
addr.labels[0] = threadInfo[0][pkgIdIndex];
(*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
if (__kmp_affinity_gran_levels < 0) {
__kmp_affinity_gran_levels = 0;
}
if (__kmp_affinity_verbose) {
__kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
}
CLEANUP_THREAD_INFO;
return 1;
}
// Sort the threadInfo table by physical Id.
qsort(threadInfo, num_avail, sizeof(*threadInfo),
__kmp_affinity_cmp_ProcCpuInfo_phys_id);
// The table is now sorted by pkgId / coreId / threadId, but we really don't
// know the radix of any of the fields. pkgId's may be sparsely assigned among
// the chips on a system. Although coreId's are usually assigned
// [0 .. coresPerPkg-1] and threadId's are usually assigned
// [0..threadsPerCore-1], we don't want to make any such assumptions.
//
// For that matter, we don't know what coresPerPkg and threadsPerCore (or the
// total # packages) are at this point - we want to determine that now. We
// only have an upper bound on the first two figures.
//
// Per-level bookkeeping for the scan below, one slot per field index:
//   counts[] - running count, reset to 1 when a more significant id changes
//   maxCt[]  - largest value counts[] has reached
//   totals[] - incremented each time the id at that level (or at a more
//              significant level) changes
//   lastId[] - id seen at that level in the previous record
unsigned *counts =
(unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
unsigned *maxCt =
(unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
unsigned *totals =
(unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
unsigned *lastId =
(unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
bool assign_thread_ids = false;
unsigned threadIdCt;
unsigned index;
// The scan is restarted from here (with assign_thread_ids set) when two
// records turn out to be identical at every level and carry no thread id.
restart_radix_check:
threadIdCt = 0;
// Initialize the counter arrays with data from threadInfo[0].
if (assign_thread_ids) {
if (threadInfo[0][threadIdIndex] == UINT_MAX) {
threadInfo[0][threadIdIndex] = threadIdCt++;
} else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
threadIdCt = threadInfo[0][threadIdIndex] + 1;
}
}
for (index = 0; index <= maxIndex; index++) {
counts[index] = 1;
maxCt[index] = 1;
totals[index] = 1;
lastId[index] = threadInfo[0][index];
;
}
// Run through the rest of the OS procs.
for (i = 1; i < num_avail; i++) {
// Find the most significant index whose id differs from the id for the
// previous OS proc.
for (index = maxIndex; index >= threadIdIndex; index--) {
if (assign_thread_ids && (index == threadIdIndex)) {
// Auto-assign the thread id field if it wasn't specified.
if (threadInfo[i][threadIdIndex] == UINT_MAX) {
threadInfo[i][threadIdIndex] = threadIdCt++;
}
// Apparently the thread id field was specified for some entries and not
// others. Start the thread id counter off at the next higher thread id.
else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
threadIdCt = threadInfo[i][threadIdIndex] + 1;
}
}
if (threadInfo[i][index] != lastId[index]) {
// Run through all indices which are less significant, and reset the
// counts to 1. At all levels up to and including index, we need to
// increment the totals and record the last id.
unsigned index2;
for (index2 = threadIdIndex; index2 < index; index2++) {
totals[index2]++;
if (counts[index2] > maxCt[index2]) {
maxCt[index2] = counts[index2];
}
counts[index2] = 1;
lastId[index2] = threadInfo[i][index2];
}
counts[index]++;
totals[index]++;
lastId[index] = threadInfo[i][index];
if (assign_thread_ids && (index > threadIdIndex)) {
#if KMP_MIC && REDUCE_TEAM_SIZE
// The default team size is the total #threads in the machine
// minus 1 thread for every core that has 3 or more threads.
teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
#endif // KMP_MIC && REDUCE_TEAM_SIZE
// Restart the thread counter, as we are on a new core.
threadIdCt = 0;
// Auto-assign the thread id field if it wasn't specified.
if (threadInfo[i][threadIdIndex] == UINT_MAX) {
threadInfo[i][threadIdIndex] = threadIdCt++;
}
// Apparently the thread id field was specified for some entries and
// not others. Start the thread id counter off at the next higher
// thread id.
else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
threadIdCt = threadInfo[i][threadIdIndex] + 1;
}
}
break;
}
}
// The inner loop ran off the bottom without finding a differing level, so
// this record matches the previous one at every level.
if (index < threadIdIndex) {
// If thread ids were specified, it is an error if they are not unique.
// Also, check that we haven't already restarted the loop (to be safe -
// shouldn't need to).
if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
__kmp_free(lastId);
__kmp_free(totals);
__kmp_free(maxCt);
__kmp_free(counts);
CLEANUP_THREAD_INFO;
*msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
return -1;
}
// If the thread ids were not specified and we see entries that
// are duplicates, start the loop over and assign the thread ids manually.
assign_thread_ids = true;
goto restart_radix_check;
}
}
#if KMP_MIC && REDUCE_TEAM_SIZE
// The default team size is the total #threads in the machine
// minus 1 thread for every core that has 3 or more threads.
teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
#endif // KMP_MIC && REDUCE_TEAM_SIZE
// Fold the counts of the final group into the maxima.
for (index = threadIdIndex; index <= maxIndex; index++) {
if (counts[index] > maxCt[index]) {
maxCt[index] = counts[index];
}
}
__kmp_nThreadsPerCore = maxCt[threadIdIndex];
nCoresPerPkg = maxCt[coreIdIndex];
nPackages = totals[pkgIdIndex];
// Check to see if the machine topology is uniform
unsigned prod = totals[maxIndex];
for (index = threadIdIndex; index < maxIndex; index++) {
prod *= maxCt[index];
}
bool uniform = (prod == totals[threadIdIndex]);
// When affinity is off, this routine will still be called to set
// __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
// Make sure all these vars are set correctly, and return now if affinity is
// not enabled.
__kmp_ncores = totals[coreIdIndex];
if (__kmp_affinity_verbose) {
if (!KMP_AFFINITY_CAPABLE()) {
KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
if (uniform) {
KMP_INFORM(Uniform, "KMP_AFFINITY");
} else {
KMP_INFORM(NonUniform, "KMP_AFFINITY");
}
} else {
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
__kmp_affin_fullMask);
KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
if (__kmp_affinity_respect_mask) {
KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
} else {
KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
}
KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
if (uniform) {
KMP_INFORM(Uniform, "KMP_AFFINITY");
} else {
KMP_INFORM(NonUniform, "KMP_AFFINITY");
}
}
// Build the "<total> x <max> x ..." string for the topology message.
kmp_str_buf_t buf;
__kmp_str_buf_init(&buf);
__kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
__kmp_str_buf_print(&buf, " x %d", maxCt[index]);
}
KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
maxCt[threadIdIndex], __kmp_ncores);
__kmp_str_buf_free(&buf);
}
#if KMP_MIC && REDUCE_TEAM_SIZE
// Set the default team size.
if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
__kmp_dflt_team_nth = teamSize;
KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting "
"__kmp_dflt_team_nth = %d\n",
__kmp_dflt_team_nth));
}
#endif // KMP_MIC && REDUCE_TEAM_SIZE
// Record the OS proc id of every accepted record, in sorted order.
KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc);
__kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
for (i = 0; i < num_avail; ++i) { // fill the os indices
__kmp_pu_os_idx[i] = threadInfo[i][osIdIndex];
}
if (__kmp_affinity_type == affinity_none) {
__kmp_free(lastId);
__kmp_free(totals);
__kmp_free(maxCt);
__kmp_free(counts);
CLEANUP_THREAD_INFO;
return 0;
}
// Count the number of levels which have more nodes at that level than at the
// parent's level (with there being an implicit root node of the top level).
// This is equivalent to saying that there is at least one node at this level
// which has a sibling. These levels are in the map, and the package level is
// always in the map.
bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
for (index = threadIdIndex; index < maxIndex; index++) {
KMP_ASSERT(totals[index] >= totals[index + 1]);
inMap[index] = (totals[index] > totals[index + 1]);
}
inMap[maxIndex] = (totals[maxIndex] > 1);
inMap[pkgIdIndex] = true;
int depth = 0;
for (index = threadIdIndex; index <= maxIndex; index++) {
if (inMap[index]) {
depth++;
}
}
KMP_ASSERT(depth > 0);
// Construct the data structure that is to be returned.
*address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * num_avail);
int pkgLevel = -1;
int coreLevel = -1;
int threadLevel = -1;
for (i = 0; i < num_avail; ++i) {
Address addr(depth);
unsigned os = threadInfo[i][osIdIndex];
int src_index;
int dst_index = 0;
// Copy the in-map fields, most significant first, and remember which
// label position the package / core / thread levels ended up at.
for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
if (!inMap[src_index]) {
continue;
}
addr.labels[dst_index] = threadInfo[i][src_index];
if (src_index == pkgIdIndex) {
pkgLevel = dst_index;
} else if (src_index == coreIdIndex) {
coreLevel = dst_index;
} else if (src_index == threadIdIndex) {
threadLevel = dst_index;
}
dst_index++;
}
(*address2os)[i] = AddrUnsPair(addr, os);
}
if (__kmp_affinity_gran_levels < 0) {
// Set the granularity level based on what levels are modeled
// in the machine topology map.
unsigned src_index;
__kmp_affinity_gran_levels = 0;
for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
if (!inMap[src_index]) {
continue;
}
switch (src_index) {
case threadIdIndex:
if (__kmp_affinity_gran > affinity_gran_thread) {
__kmp_affinity_gran_levels++;
}
break;
case coreIdIndex:
if (__kmp_affinity_gran > affinity_gran_core) {
__kmp_affinity_gran_levels++;
}
break;
case pkgIdIndex:
if (__kmp_affinity_gran > affinity_gran_package) {
__kmp_affinity_gran_levels++;
}
break;
}
}
}
if (__kmp_affinity_verbose) {
__kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
coreLevel, threadLevel);
}
__kmp_free(inMap);
__kmp_free(lastId);
__kmp_free(totals);
__kmp_free(maxCt);
__kmp_free(counts);
CLEANUP_THREAD_INFO;
return depth;
}
// Build and return a table of affinity masks indexed by OS thread ID. Threads
// that are sufficiently close to each other (per __kmp_affinity_gran_levels)
// share one mask that is the union of their individual bits, which is how
// granularity coarser than "fine" is implemented.
//
// address2os is sorted in place and the leader flag of every entry is set.
// On return *maxIndex holds the largest OS id found and *numUnique the number
// of distinct masks that were formed.
static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex,
                                            unsigned *numUnique,
                                            AddrUnsPair *address2os,
                                            unsigned numAddrs) {
  KMP_ASSERT(numAddrs > 0);
  const unsigned depth = address2os[0].first.depth;

  // The table has to be large enough for the biggest OS id that appears.
  unsigned maxOsId = 0;
  for (unsigned k = 0; k < numAddrs; ++k) {
    if (address2os[k].second > maxOsId) {
      maxOsId = address2os[k].second;
    }
  }
  kmp_affin_mask_t *osId2Mask;
  KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1));

  // Put the table into physical order so that all threads sharing a
  // core/package/node sit next to each other.
  qsort(address2os, numAddrs, sizeof(*address2os),
        __kmp_affinity_cmp_Address_labels);

  KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
  if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
    KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
  }
  if (__kmp_affinity_gran_levels >= (int)depth) {
    if (__kmp_affinity_verbose ||
        (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
      KMP_WARNING(AffThreadsMayMigrate);
    }
  }

  // Walk the sorted table and cut it into groups of consecutive entries that
  // are close to the group's first entry (its leader). Every member of a
  // group receives the same mask: the union of the members' OS proc bits.
  unsigned groups = 0;
  unsigned pending = 0; // first entry whose mask has not been stored yet
  unsigned head = 0; // index of the current group's leader
  Address *headAddr = &(address2os[0].first);
  kmp_affin_mask_t *groupMask;
  KMP_CPU_ALLOC_ON_STACK(groupMask);
  KMP_CPU_ZERO(groupMask);
  KMP_CPU_SET(address2os[0].second, groupMask);
  for (unsigned cur = 1;; ++cur) {
    // Still inside the current group: just accumulate the bit.
    if ((cur < numAddrs) &&
        headAddr->isClose(address2os[cur].first, __kmp_affinity_gran_levels)) {
      KMP_CPU_SET(address2os[cur].second, groupMask);
      continue;
    }

    // The group [pending, cur) is complete (either a new group starts at cur
    // or the table is exhausted): store its mask for every member and flag
    // the leader.
    for (; pending < cur; ++pending) {
      const unsigned osId = address2os[pending].second;
      KMP_DEBUG_ASSERT(osId <= maxOsId);
      KMP_CPU_COPY(KMP_CPU_INDEX(osId2Mask, osId), groupMask);
      address2os[pending].first.leader = (pending == head);
    }
    ++groups;
    if (cur == numAddrs) {
      break;
    }

    // Open the next group with cur as its leader.
    head = cur;
    headAddr = &(address2os[cur].first);
    KMP_CPU_ZERO(groupMask);
    KMP_CPU_SET(address2os[cur].second, groupMask);
  }
  KMP_CPU_FREE_FROM_STACK(groupMask);

  *maxIndex = maxOsId;
  *numUnique = groups;
  return osId2Mask;
}
// Stuff for the affinity proclist parsers. It's easier to declare these vars
// as file-static than to try and pass them through the calling sequence of
// the recursive-descent OMP_PLACES parser.
//
// newMasks    - growable scratch array that collects the masks being built
// numNewMasks - current capacity of newMasks (in masks)
// nextNewMask - number of masks stored so far, i.e. the next free slot
static kmp_affin_mask_t *newMasks;
static int numNewMasks;
static int nextNewMask;
// Append a copy of _mask to newMasks. When the array is full its capacity is
// doubled: a new array is allocated, the existing masks are copied over and
// the old array is released.
#define ADD_MASK(_mask) \
{ \
if (nextNewMask >= numNewMasks) { \
int i; \
numNewMasks *= 2; \
kmp_affin_mask_t *temp; \
KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \
for (i = 0; i < numNewMasks / 2; i++) { \
kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \
kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \
KMP_CPU_COPY(dest, src); \
} \
KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \
newMasks = temp; \
} \
KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
nextNewMask++; \
}
// Append the mask stored for OS proc _osId in _osId2Mask. The id is ignored
// (with a warning when verbose/warnings are enabled) if it is larger than
// _maxOsId or if its own bit is not set in its table entry. Note that _osId
// is evaluated more than once, so it must not have side effects.
#define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \
{ \
if (((_osId) > _maxOsId) || \
(!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
if (__kmp_affinity_verbose || \
(__kmp_affinity_warnings && \
(__kmp_affinity_type != affinity_none))) { \
KMP_WARNING(AffIgnoreInvalidProcID, _osId); \
} \
} else { \
ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \
} \
}
// Re-parse the proclist (for the explicit affinity type), and form the list
// of affinity newMasks indexed by gtid.
//
// The list is a sequence of items separated by optional commas/whitespace:
//   N          - a single OS proc id
//   N-M        - an inclusive range of OS proc ids
//   N-M:S      - an inclusive range stepped by the (possibly negative) stride S
//   {N,M,...}  - a set of OS proc ids merged into one mask
// OS proc ids that are out of range or not present in osId2Mask are skipped
// with a warning. Malformed input trips KMP_ASSERT2.
//
// Parameters:
//   out_masks    - out: newly allocated array holding the resulting masks, or
//                  NULL when no mask was produced
//   out_numMasks - out: number of masks in *out_masks
//   proclist     - NUL-terminated text to parse
//   osId2Mask    - table of masks indexed by OS proc id
//   maxOsId      - largest valid index into osId2Mask
static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
                                            unsigned int *out_numMasks,
                                            const char *proclist,
                                            kmp_affin_mask_t *osId2Mask,
                                            int maxOsId) {
  int i;
  const char *scan = proclist;
  const char *next = proclist;

  // Start with a small temporary mask vector; ADD_MASK doubles its capacity
  // whenever it fills up.
  numNewMasks = 2;
  KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
  nextNewMask = 0;
  // Scratch mask used to accumulate the union for a {...} set.
  kmp_affin_mask_t *sumMask;
  KMP_CPU_ALLOC(sumMask);
  int setSize = 0;

  for (;;) {
    int start, end, stride;

    SKIP_WS(scan);
    next = scan;
    if (*next == '\0') {
      break;
    }

    if (*next == '{') {
      int num;
      setSize = 0;
      next++; // skip '{'
      SKIP_WS(next);
      scan = next;

      // Read the first integer in the set.
      KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist");
      SKIP_DIGITS(next);
      num = __kmp_str_to_int(scan, *next);
      KMP_ASSERT2(num >= 0, "bad explicit proc list");

      // Copy the mask for that osId to the sum (union) mask.
      if ((num > maxOsId) ||
          (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
        if (__kmp_affinity_verbose ||
            (__kmp_affinity_warnings &&
             (__kmp_affinity_type != affinity_none))) {
          KMP_WARNING(AffIgnoreInvalidProcID, num);
        }
        KMP_CPU_ZERO(sumMask);
      } else {
        KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
        setSize = 1;
      }

      for (;;) {
        // Check for end of set.
        SKIP_WS(next);
        if (*next == '}') {
          next++; // skip '}'
          break;
        }

        // Skip optional comma.
        if (*next == ',') {
          next++;
        }
        SKIP_WS(next);

        // Read the next integer in the set.
        scan = next;
        KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
        SKIP_DIGITS(next);
        num = __kmp_str_to_int(scan, *next);
        KMP_ASSERT2(num >= 0, "bad explicit proc list");

        // Add the mask for that osId to the sum mask.
        if ((num > maxOsId) ||
            (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
          if (__kmp_affinity_verbose ||
              (__kmp_affinity_warnings &&
               (__kmp_affinity_type != affinity_none))) {
            KMP_WARNING(AffIgnoreInvalidProcID, num);
          }
        } else {
          KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
          setSize++;
        }
      }
      // A set in which every id was rejected contributes no mask.
      if (setSize > 0) {
        ADD_MASK(sumMask);
      }

      SKIP_WS(next);
      if (*next == ',') {
        next++;
      }
      scan = next;
      continue;
    }

    // Read the first integer.
    KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
    SKIP_DIGITS(next);
    start = __kmp_str_to_int(scan, *next);
    KMP_ASSERT2(start >= 0, "bad explicit proc list");
    SKIP_WS(next);

    // If this isn't a range, then add a mask to the list and go on.
    if (*next != '-') {
      ADD_MASK_OSID(start, osId2Mask, maxOsId);

      // Skip optional comma.
      if (*next == ',') {
        next++;
      }
      scan = next;
      continue;
    }

    // This is a range.  Skip over the '-' and read in the 2nd int.
    next++; // skip '-'
    SKIP_WS(next);
    scan = next;
    KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
    SKIP_DIGITS(next);
    end = __kmp_str_to_int(scan, *next);
    KMP_ASSERT2(end >= 0, "bad explicit proc list");

    // Check for a stride parameter
    stride = 1;
    SKIP_WS(next);
    if (*next == ':') {
      // A stride is specified.  Skip over the ':' and read the 3rd int.
      int sign = +1;
      next++; // skip ':'
      SKIP_WS(next);
      scan = next;
      if (*next == '-') {
        sign = -1;
        next++;
        SKIP_WS(next);
        scan = next;
      }
      KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
      SKIP_DIGITS(next);
      stride = __kmp_str_to_int(scan, *next);
      KMP_ASSERT2(stride >= 0, "bad explicit proc list");
      stride *= sign;
    }

    // Do some range checks.
    KMP_ASSERT2(stride != 0, "bad explicit proc list");
    if (stride > 0) {
      KMP_ASSERT2(start <= end, "bad explicit proc list");
    } else {
      KMP_ASSERT2(start >= end, "bad explicit proc list");
    }
    KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");

    // Add the mask for each OS proc # to the list.
    if (stride > 0) {
      do {
        ADD_MASK_OSID(start, osId2Mask, maxOsId);
        start += stride;
      } while (start <= end);
    } else {
      do {
        ADD_MASK_OSID(start, osId2Mask, maxOsId);
        start += stride;
      } while (start >= end);
    }

    // Skip optional comma.
    SKIP_WS(next);
    if (*next == ',') {
      next++;
    }
    scan = next;
  }

  *out_numMasks = nextNewMask;
  if (nextNewMask == 0) {
    *out_masks = NULL;
    KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
    // The scratch mask must be released on this path too, otherwise it leaks
    // whenever the list yields no usable OS proc id.
    KMP_CPU_FREE(sumMask);
    return;
  }

  // Hand the caller a right-sized copy and drop the temporary vector.
  KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
  for (i = 0; i < nextNewMask; i++) {
    kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
    kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
    KMP_CPU_COPY(dest, src);
  }
  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
  KMP_CPU_FREE(sumMask);
}
#if OMP_40_ENABLED
/*-----------------------------------------------------------------------------
Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
places. Again, Here is the grammar:
place_list := place
place_list := place , place_list
place := num
place := place : num
place := place : num : signed
place := { subplace_list }
place := ! place // (lowest priority)
subplace_list := subplace
subplace_list := subplace , subplace_list
subplace := num
subplace := num : num
subplace := num : num : signed
signed := num
signed := + signed
signed := - signed
-----------------------------------------------------------------------------*/
// Parse the contents of a "{ ... }" place (the subplace_list production) and
// OR the mask of every valid OS proc id it names into tempMask.
//
// Each subplace is one of
//   num                   - a single OS proc id
//   num : count           - count consecutive ids starting at num
//   num : count : stride  - count ids starting at num, stepping by the signed
//                           stride
// Subplaces are separated by ','. Parsing stops with *scan pointing at the
// closing '}', which is left for the caller to consume.
//
// Parameters:
//   scan      - in/out: current position in the places string
//   osId2Mask - table of masks indexed by OS proc id
//   maxOsId   - largest valid index into osId2Mask
//   tempMask  - in/out: mask that accumulates the union
//   setSize   - in/out: incremented once for every id added to tempMask
//
// Ids that are out of range or not present in osId2Mask are skipped with a
// warning. Malformed input trips KMP_ASSERT/KMP_ASSERT2.
static void __kmp_process_subplace_list(const char **scan,
                                        kmp_affin_mask_t *osId2Mask,
                                        int maxOsId, kmp_affin_mask_t *tempMask,
                                        int *setSize) {
  const char *next;

  for (;;) {
    int start, count, stride, i;

    // Read in the starting proc id
    SKIP_WS(*scan);
    KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
    next = *scan;
    SKIP_DIGITS(next);
    start = __kmp_str_to_int(*scan, *next);
    KMP_ASSERT(start >= 0);
    *scan = next;

    // valid follow sets are ',' ':' and '}'
    SKIP_WS(*scan);
    if (**scan == '}' || **scan == ',') {
      if ((start > maxOsId) ||
          (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
        if (__kmp_affinity_verbose ||
            (__kmp_affinity_warnings &&
             (__kmp_affinity_type != affinity_none))) {
          KMP_WARNING(AffIgnoreInvalidProcID, start);
        }
      } else {
        KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
        (*setSize)++;
      }
      if (**scan == '}') {
        break;
      }
      (*scan)++; // skip ','
      continue;
    }
    KMP_ASSERT2(**scan == ':', "bad explicit places list");
    (*scan)++; // skip ':'

    // Read count parameter
    SKIP_WS(*scan);
    KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
    next = *scan;
    SKIP_DIGITS(next);
    count = __kmp_str_to_int(*scan, *next);
    KMP_ASSERT(count >= 0);
    *scan = next;

    // valid follow sets are ',' ':' and '}'
    SKIP_WS(*scan);
    if (**scan == '}' || **scan == ',') {
      for (i = 0; i < count; i++) {
        if ((start > maxOsId) ||
            (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
          if (__kmp_affinity_verbose ||
              (__kmp_affinity_warnings &&
               (__kmp_affinity_type != affinity_none))) {
            KMP_WARNING(AffIgnoreInvalidProcID, start);
          }
          break; // don't proliferate warnings for large count
        } else {
          KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
          start++;
          (*setSize)++;
        }
      }
      if (**scan == '}') {
        break;
      }
      (*scan)++; // skip ','
      continue;
    }
    KMP_ASSERT2(**scan == ':', "bad explicit places list");
    (*scan)++; // skip ':'

    // Read stride parameter. Any number of leading '+' / '-' signs is
    // accepted; each '-' flips the sign.
    int sign = +1;
    for (;;) {
      SKIP_WS(*scan);
      if (**scan == '+') {
        (*scan)++; // skip '+'
        continue;
      }
      if (**scan == '-') {
        sign *= -1;
        (*scan)++; // skip '-'
        continue;
      }
      break;
    }
    SKIP_WS(*scan);
    KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
    next = *scan;
    SKIP_DIGITS(next);
    stride = __kmp_str_to_int(*scan, *next);
    KMP_ASSERT(stride >= 0);
    *scan = next;
    stride *= sign;

    // valid follow sets are ',' and '}'
    SKIP_WS(*scan);
    if (**scan == '}' || **scan == ',') {
      for (i = 0; i < count; i++) {
        // With a negative stride the running id can drop below zero; reject
        // it before it is used to index osId2Mask.
        if ((start < 0) || (start > maxOsId) ||
            (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
          if (__kmp_affinity_verbose ||
              (__kmp_affinity_warnings &&
               (__kmp_affinity_type != affinity_none))) {
            KMP_WARNING(AffIgnoreInvalidProcID, start);
          }
          break; // don't proliferate warnings for large count
        } else {
          KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
          start += stride;
          (*setSize)++;
        }
      }
      if (**scan == '}') {
        break;
      }
      (*scan)++; // skip ','
      continue;
    }

    KMP_ASSERT2(0, "bad explicit places list");
  }
}
// Parse one place starting at *scan and merge the OS procs it names into
// tempMask. The first non-blank character selects the form:
//   '{' ... '}'  a subplace list, handed to __kmp_process_subplace_list
//   '!' place    the nested place is parsed recursively, then tempMask is
//                complemented with KMP_CPU_COMPLEMENT(maxOsId, tempMask)
//   digits       a single OS proc id
// Any other character trips KMP_ASSERT2.
//
// scan      - in/out cursor into the place list; advanced past consumed text
// osId2Mask - mask array indexed by OS proc id (via KMP_CPU_INDEX)
// maxOsId   - largest OS proc id that may be looked up in osId2Mask
// tempMask  - accumulator for the place being built
// setSize   - incremented once for every proc id accepted by this call
static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
                                int maxOsId, kmp_affin_mask_t *tempMask,
                                int *setSize) {
  SKIP_WS(*scan);
  const char lead = **scan;

  // Braced subplace list.
  if (lead == '{') {
    (*scan)++; // step over '{'
    __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize);
    KMP_ASSERT2(**scan == '}', "bad explicit places list");
    (*scan)++; // step over '}'
    return;
  }

  // Negated place: parse what follows, then flip the accumulated mask.
  if (lead == '!') {
    (*scan)++; // step over '!'
    __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
    KMP_CPU_COMPLEMENT(maxOsId, tempMask);
    return;
  }

  // Everything else must be a bare number.
  if (!((lead >= '0') && (lead <= '9'))) {
    KMP_ASSERT2(0, "bad explicit places list");
    return;
  }

  const char *numEnd = *scan;
  SKIP_DIGITS(numEnd);
  int num = __kmp_str_to_int(*scan, *numEnd);
  KMP_ASSERT(num >= 0);

  // The range test must come first so osId2Mask is never indexed past maxOsId.
  const bool usable =
      (num <= maxOsId) && KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num));
  if (usable) {
    KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
    (*setSize)++;
  } else if (__kmp_affinity_verbose ||
             (__kmp_affinity_warnings &&
              (__kmp_affinity_type != affinity_none))) {
    KMP_WARNING(AffIgnoreInvalidProcID, num);
  }
  *scan = numEnd; // consume the number
}
// static void
// Parse an explicit place list and return the resulting array of masks.
//
// The list is a comma-separated sequence of entries, each of the form
//   place
//   place : count
//   place : count : [+|-]... stride
// where "place" is whatever __kmp_process_place accepts. For the ':' forms the
// initial place is emitted, then each following place is derived from the
// previous one by shifting every set proc id by stride (default +1), count
// times in total. Generation stops early once a derived place is empty.
//
// out_masks    - receives a newly allocated array (KMP_CPU_ALLOC_ARRAY) holding
//                the places, or NULL when no place was produced
// out_numMasks - receives the number of places in *out_masks
// placelist    - NUL-terminated text to parse; malformed input trips
//                KMP_ASSERT2 / KMP_ASSERT
// osId2Mask    - mask array indexed by OS proc id (via KMP_CPU_INDEX)
// maxOsId      - largest OS proc id that may be looked up in osId2Mask
//
// The file-scope newMasks / numNewMasks / nextNewMask buffer is used as the
// growing work list (through ADD_MASK) and is released before returning.
void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
                                      unsigned int *out_numMasks,
                                      const char *placelist,
                                      kmp_affin_mask_t *osId2Mask,
                                      int maxOsId) {
  int i, j, count, stride, sign;
  const char *scan = placelist;
  const char *next = placelist;

  numNewMasks = 2;
  KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
  nextNewMask = 0;

  // tempMask is modified based on the previous or initial
  // place to form the current place
  // previousMask contains the previous place
  kmp_affin_mask_t *tempMask;
  kmp_affin_mask_t *previousMask;
  KMP_CPU_ALLOC(tempMask);
  KMP_CPU_ZERO(tempMask);
  KMP_CPU_ALLOC(previousMask);
  KMP_CPU_ZERO(previousMask);
  int setSize = 0;

  for (;;) {
    __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);

    // valid follow sets are ',' ':' and EOL
    SKIP_WS(scan);
    if (*scan == '\0' || *scan == ',') {
      // Plain place: keep it only if at least one proc id was accepted.
      if (setSize > 0) {
        ADD_MASK(tempMask);
      }
      KMP_CPU_ZERO(tempMask);
      setSize = 0;
      if (*scan == '\0') {
        break;
      }
      scan++; // skip ','
      continue;
    }

    KMP_ASSERT2(*scan == ':', "bad explicit places list");
    scan++; // skip ':'

    // Read count parameter
    SKIP_WS(scan);
    KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
    next = scan;
    SKIP_DIGITS(next);
    count = __kmp_str_to_int(scan, *next);
    KMP_ASSERT(count >= 0);
    scan = next;

    // valid follow sets are ',' ':' and EOL
    SKIP_WS(scan);
    if (*scan == '\0' || *scan == ',') {
      stride = +1;
    } else {
      KMP_ASSERT2(*scan == ':', "bad explicit places list");
      scan++; // skip ':'

      // Read stride parameter; any run of '+' / '-' is folded into one sign
      sign = +1;
      for (;;) {
        SKIP_WS(scan);
        if (*scan == '+') {
          scan++; // skip '+'
          continue;
        }
        if (*scan == '-') {
          sign *= -1;
          scan++; // skip '-'
          continue;
        }
        break;
      }
      SKIP_WS(scan);
      KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
      next = scan;
      SKIP_DIGITS(next);
      stride = __kmp_str_to_int(scan, *next);
      KMP_DEBUG_ASSERT(stride >= 0);
      scan = next;
      stride *= sign;
    }

    // Add places determined by initial_place : count : stride
    for (i = 0; i < count; i++) {
      if (setSize == 0) {
        break;
      }
      // Add the current place, then build the next place (tempMask) from that
      KMP_CPU_COPY(previousMask, tempMask);
      ADD_MASK(previousMask);
      KMP_CPU_ZERO(tempMask);
      setSize = 0;
      KMP_CPU_SET_ITERATE(j, previousMask) {
        if (!KMP_CPU_ISSET(j, previousMask)) {
          continue;
        }
        if ((j + stride > maxOsId) || (j + stride < 0) ||
            (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
            (!KMP_CPU_ISSET(j + stride,
                            KMP_CPU_INDEX(osId2Mask, j + stride)))) {
          // The place built on the final iteration is never added, so stay
          // quiet about procs dropped from it.
          if ((__kmp_affinity_verbose ||
               (__kmp_affinity_warnings &&
                (__kmp_affinity_type != affinity_none))) &&
              i < count - 1) {
            KMP_WARNING(AffIgnoreInvalidProcID, j + stride);
          }
          continue;
        }
        KMP_CPU_SET(j + stride, tempMask);
        setSize++;
      }
    }
    KMP_CPU_ZERO(tempMask);
    setSize = 0;

    // valid follow sets are ',' and EOL
    SKIP_WS(scan);
    if (*scan == '\0') {
      break;
    }
    if (*scan == ',') {
      scan++; // skip ','
      continue;
    }

    KMP_ASSERT2(0, "bad explicit places list");
  }

  // Parsing is finished, so the scratch masks are no longer needed. Release
  // them here, ahead of both exits, so the empty-result return below does not
  // leak them.
  KMP_CPU_FREE(tempMask);
  KMP_CPU_FREE(previousMask);

  *out_numMasks = nextNewMask;
  if (nextNewMask == 0) {
    *out_masks = NULL;
    KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
    return;
  }

  // Hand the caller its own copy of the places and drop the work buffer.
  KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
  for (i = 0; i < nextNewMask; i++) {
    kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
    kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
    KMP_CPU_COPY(dest, src);
  }
  KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
}
#endif /* OMP_40_ENABLED */
#undef ADD_MASK
#undef ADD_MASK_OSID
#if KMP_USE_HWLOC
// Drop the PUs belonging to object o from __kmp_affin_fullMask.
// The walk starts at the PU that __kmp_hwloc_count_children_by_type stores in
// hT and advances with hwloc_get_next_obj_by_type for as many steps as that
// helper reported. Returns how many PUs were actually cleared, i.e. those that
// were still set in the full mask.
static int __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) {
  hwloc_obj_t hT = NULL;
  const int numPUs = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT);
  int skipped = 0;

  int visited = 0;
  while (visited < numPUs) {
    KMP_DEBUG_ASSERT(hT);
    unsigned idx = hT->os_index;
    if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
      KMP_CPU_CLR(idx, __kmp_affin_fullMask);
      KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
      skipped += 1;
    }
    hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT);
    visited += 1;
  }

  return skipped;
}
// Report whether object o still owns a PU that is set in
// __kmp_affin_fullMask. Returns 1 as soon as such a PU is seen, 0 if the walk
// over the PUs reported by __kmp_hwloc_count_children_by_type finds none.
static int __kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o) {
  hwloc_obj_t hT = NULL;
  const int numPUs = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT);

  int visited = 0;
  while (visited < numPUs) {
    KMP_DEBUG_ASSERT(hT);
    unsigned idx = hT->os_index;
    if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
      return 1; // at least one PU is still available
    }
    hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT);
    visited += 1;
  }

  return 0; // every PU of this object is outside the full mask
}
#endif // KMP_USE_HWLOC
// Restrict the machine topology to the hardware subset described by the
// __kmp_hws_* settings (reported to the user as "KMP_HW_SUBSET").
//
// pAddr - in/out: pointer to the topology array. When non-NULL, the entries
//         for the procs that remain are copied into a newly allocated array,
//         the old array is freed and *pAddr is pointed at the new one. May be
//         NULL (affinity_none), in which case only the masks and counters are
//         updated.
// depth - topology depth; only consulted on the non-hwloc path, which rejects
//         anything deeper than 3 levels.
//
// Side effects on success: procs outside the subset are cleared from
// __kmp_affin_fullMask, and nPackages, nCoresPerPkg, __kmp_nThreadsPerCore,
// __kmp_avail_proc and __kmp_ncores are rewritten to describe the subset.
// On every path (including the early exits) __kmp_pu_os_idx is freed and
// reset to NULL at the _exit label.
//
// Two implementations follow: one walks the hwloc topology tree (used when the
// affinity API type is HWLOC), the other assumes a uniform
// socket/core/thread layout.
static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) {
// newAddr is only assigned (and only read) when pAddr is non-NULL.
AddrUnsPair *newAddr;
if (__kmp_hws_requested == 0)
goto _exit; // no topology limiting actions requested, exit
#if KMP_USE_HWLOC
if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
// Number of subobjects calculated dynamically, this works fine for
// any non-uniform topology.
// L2 cache objects are determined by depth, other objects - by type.
hwloc_topology_t tp = __kmp_hwloc_topology;
int nS = 0, nN = 0, nL = 0, nC = 0,
nT = 0; // logical index including skipped
int nCr = 0, nTr = 0; // number of requested units
int nPkg = 0, nCo = 0, n_new = 0, n_old = 0, nCpP = 0, nTpC = 0; // counters
hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
int L2depth, idx;
// check support of extensions ----------------------------------
int numa_support = 0, tile_support = 0;
// Use the last available PU as the probe object for the checks below.
if (__kmp_pu_os_idx)
hT = hwloc_get_pu_obj_by_os_index(tp,
__kmp_pu_os_idx[__kmp_avail_proc - 1]);
else
hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1);
if (hT == NULL) { // something's gone wrong
KMP_WARNING(AffHWSubsetUnsupported);
goto _exit;
}
// check NUMA node
hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
// NOTE(review): hS is dereferenced here without a NULL check - confirm a
// package ancestor always exists for the probe PU.
if (hN != NULL && hN->depth > hS->depth) {
numa_support = 1; // 1 in case socket includes node(s)
} else if (__kmp_hws_node.num > 0) {
// don't support sockets inside NUMA node (no such HW found for testing)
KMP_WARNING(AffHWSubsetUnsupported);
goto _exit;
}
// check L2 cache, get object by depth because of multiple caches
L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT);
if (hL != NULL &&
__kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1) {
tile_support = 1; // no sense to count L2 if it includes single core
} else if (__kmp_hws_tile.num > 0) {
if (__kmp_hws_core.num == 0) {
__kmp_hws_core = __kmp_hws_tile; // replace L2 with core
__kmp_hws_tile.num = 0;
} else {
// L2 and core are both requested, but represent same object
KMP_WARNING(AffHWSubsetInvalid);
goto _exit;
}
}
// end of check of extensions -----------------------------------
// fill in unset items, validate settings -----------------------
// A requested count of 0 means "use everything available at this level";
// an offset at or beyond the available count is rejected with a warning.
if (__kmp_hws_socket.num == 0)
__kmp_hws_socket.num = nPackages; // use all available sockets
if (__kmp_hws_socket.offset >= nPackages) {
KMP_WARNING(AffHWSubsetManySockets);
goto _exit;
}
if (numa_support) {
hN = NULL;
int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE,
&hN); // num nodes in socket
if (__kmp_hws_node.num == 0)
__kmp_hws_node.num = NN; // use all available nodes
if (__kmp_hws_node.offset >= NN) {
KMP_WARNING(AffHWSubsetManyNodes);
goto _exit;
}
if (tile_support) {
// get num tiles in node
int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
if (__kmp_hws_tile.num == 0) {
__kmp_hws_tile.num = NL + 1;
} // use all available tiles, some node may have more tiles, thus +1
if (__kmp_hws_tile.offset >= NL) {
KMP_WARNING(AffHWSubsetManyTiles);
goto _exit;
}
int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
&hC); // num cores in tile
if (__kmp_hws_core.num == 0)
__kmp_hws_core.num = NC; // use all available cores
if (__kmp_hws_core.offset >= NC) {
KMP_WARNING(AffHWSubsetManyCores);
goto _exit;
}
} else { // tile_support
int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE,
&hC); // num cores in node
if (__kmp_hws_core.num == 0)
__kmp_hws_core.num = NC; // use all available cores
if (__kmp_hws_core.offset >= NC) {
KMP_WARNING(AffHWSubsetManyCores);
goto _exit;
}
} // tile_support
} else { // numa_support
if (tile_support) {
// get num tiles in socket
int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL);
if (__kmp_hws_tile.num == 0)
__kmp_hws_tile.num = NL; // use all available tiles
if (__kmp_hws_tile.offset >= NL) {
KMP_WARNING(AffHWSubsetManyTiles);
goto _exit;
}
int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
&hC); // num cores in tile
if (__kmp_hws_core.num == 0)
__kmp_hws_core.num = NC; // use all available cores
if (__kmp_hws_core.offset >= NC) {
KMP_WARNING(AffHWSubsetManyCores);
goto _exit;
}
} else { // tile_support
int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE,
&hC); // num cores in socket
if (__kmp_hws_core.num == 0)
__kmp_hws_core.num = NC; // use all available cores
if (__kmp_hws_core.offset >= NC) {
KMP_WARNING(AffHWSubsetManyCores);
goto _exit;
}
} // tile_support
}
if (__kmp_hws_proc.num == 0)
__kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs
if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) {
KMP_WARNING(AffHWSubsetManyProcs);
goto _exit;
}
// end of validation --------------------------------------------
if (pAddr) // pAddr is NULL in case of affinity_none
newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) *
__kmp_avail_proc); // max size
// main loop to form HW subset ----------------------------------
// Bookkeeping used by all four walks below:
//   n_old - index of the current proc in the old (*pAddr) array; advanced for
//           every proc visited, kept or skipped
//   n_new - number of procs kept so far, i.e. next free slot in newAddr
//   nS/nN/nL/nC/nT - 1-based running index of the socket/node/tile/core/PU
//           among those that still have PUs in the full mask
hS = NULL;
int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE);
for (int s = 0; s < NP; ++s) {
// Check Socket -----------------------------------------------
hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS);
if (!__kmp_hwloc_obj_has_PUs(tp, hS))
continue; // skip socket if all PUs are out of fullMask
++nS; // only count objects those have PUs in affinity mask
if (nS <= __kmp_hws_socket.offset ||
nS > __kmp_hws_socket.num + __kmp_hws_socket.offset) {
n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket
continue; // move to next socket
}
nCr = 0; // count number of cores per socket
// socket requested, go down the topology tree
// check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile)
if (numa_support) {
nN = 0;
hN = NULL;
// num nodes in current socket
int NN =
__kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, &hN);
for (int n = 0; n < NN; ++n) {
// Check NUMA Node ----------------------------------------
if (!__kmp_hwloc_obj_has_PUs(tp, hN)) {
hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
continue; // skip node if all PUs are out of fullMask
}
++nN;
if (nN <= __kmp_hws_node.offset ||
nN > __kmp_hws_node.num + __kmp_hws_node.offset) {
// skip node as not requested
n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // skip node
hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
continue; // move to next node
}
// node requested, go down the topology tree
if (tile_support) {
nL = 0;
hL = NULL;
int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
for (int l = 0; l < NL; ++l) {
// Check L2 (tile) ------------------------------------
if (!__kmp_hwloc_obj_has_PUs(tp, hL)) {
hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
continue; // skip tile if all PUs are out of fullMask
}
++nL;
if (nL <= __kmp_hws_tile.offset ||
nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) {
// skip tile as not requested
n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile
hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
continue; // move to next tile
}
// tile requested, go down the topology tree
nC = 0;
hC = NULL;
// num cores in current tile
int NC = __kmp_hwloc_count_children_by_type(tp, hL,
HWLOC_OBJ_CORE, &hC);
for (int c = 0; c < NC; ++c) {
// Check Core ---------------------------------------
if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
continue; // skip core if all PUs are out of fullMask
}
++nC;
if (nC <= __kmp_hws_core.offset ||
nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
// skip core as not requested
n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
continue; // move to next core
}
// core requested, go down to PUs
nT = 0;
nTr = 0;
hT = NULL;
// num procs in current core
int NT = __kmp_hwloc_count_children_by_type(tp, hC,
HWLOC_OBJ_PU, &hT);
for (int t = 0; t < NT; ++t) {
// Check PU ---------------------------------------
idx = hT->os_index;
if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
continue; // skip PU if not in fullMask
}
++nT;
if (nT <= __kmp_hws_proc.offset ||
nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
// skip PU
KMP_CPU_CLR(idx, __kmp_affin_fullMask);
++n_old;
KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
continue; // move to next PU
}
++nTr;
if (pAddr) // collect requested thread's data
newAddr[n_new] = (*pAddr)[n_old];
++n_new;
++n_old;
hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
} // threads loop
if (nTr > 0) {
++nCr; // num cores per socket
++nCo; // total num cores
if (nTr > nTpC)
nTpC = nTr; // calc max threads per core
}
hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
} // cores loop
hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
} // tiles loop
} else { // tile_support
// no tiles, check cores
nC = 0;
hC = NULL;
// num cores in current node
int NC =
__kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, &hC);
for (int c = 0; c < NC; ++c) {
// Check Core ---------------------------------------
if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
continue; // skip core if all PUs are out of fullMask
}
++nC;
if (nC <= __kmp_hws_core.offset ||
nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
// skip core as not requested
n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
continue; // move to next core
}
// core requested, go down to PUs
nT = 0;
nTr = 0;
hT = NULL;
int NT =
__kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
for (int t = 0; t < NT; ++t) {
// Check PU ---------------------------------------
idx = hT->os_index;
if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
continue; // skip PU if not in fullMask
}
++nT;
if (nT <= __kmp_hws_proc.offset ||
nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
// skip PU
KMP_CPU_CLR(idx, __kmp_affin_fullMask);
++n_old;
KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
continue; // move to next PU
}
++nTr;
if (pAddr) // collect requested thread's data
newAddr[n_new] = (*pAddr)[n_old];
++n_new;
++n_old;
hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
} // threads loop
if (nTr > 0) {
++nCr; // num cores per socket
++nCo; // total num cores
if (nTr > nTpC)
nTpC = nTr; // calc max threads per core
}
hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
} // cores loop
} // tiles support
hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
} // nodes loop
} else { // numa_support
// no NUMA support
if (tile_support) {
nL = 0;
hL = NULL;
// num tiles in current socket
int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL);
for (int l = 0; l < NL; ++l) {
// Check L2 (tile) ------------------------------------
if (!__kmp_hwloc_obj_has_PUs(tp, hL)) {
hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
continue; // skip tile if all PUs are out of fullMask
}
++nL;
if (nL <= __kmp_hws_tile.offset ||
nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) {
// skip tile as not requested
n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile
hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
continue; // move to next tile
}
// tile requested, go down the topology tree
nC = 0;
hC = NULL;
// num cores per tile
int NC =
__kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC);
for (int c = 0; c < NC; ++c) {
// Check Core ---------------------------------------
if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
continue; // skip core if all PUs are out of fullMask
}
++nC;
if (nC <= __kmp_hws_core.offset ||
nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
// skip core as not requested
n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
continue; // move to next core
}
// core requested, go down to PUs
nT = 0;
nTr = 0;
hT = NULL;
// num procs per core
int NT =
__kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
for (int t = 0; t < NT; ++t) {
// Check PU ---------------------------------------
idx = hT->os_index;
if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
continue; // skip PU if not in fullMask
}
++nT;
if (nT <= __kmp_hws_proc.offset ||
nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
// skip PU
KMP_CPU_CLR(idx, __kmp_affin_fullMask);
++n_old;
KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
continue; // move to next PU
}
++nTr;
if (pAddr) // collect requested thread's data
newAddr[n_new] = (*pAddr)[n_old];
++n_new;
++n_old;
hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
} // threads loop
if (nTr > 0) {
++nCr; // num cores per socket
++nCo; // total num cores
if (nTr > nTpC)
nTpC = nTr; // calc max threads per core
}
hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
} // cores loop
hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
} // tiles loop
} else { // tile_support
// no tiles, check cores
nC = 0;
hC = NULL;
// num cores in socket
int NC =
__kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, &hC);
for (int c = 0; c < NC; ++c) {
// Check Core -------------------------------------------
if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
continue; // skip core if all PUs are out of fullMask
}
++nC;
if (nC <= __kmp_hws_core.offset ||
nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
// skip core as not requested
n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
continue; // move to next core
}
// core requested, go down to PUs
nT = 0;
nTr = 0;
hT = NULL;
// num procs per core
int NT =
__kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
for (int t = 0; t < NT; ++t) {
// Check PU ---------------------------------------
idx = hT->os_index;
if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
continue; // skip PU if not in fullMask
}
++nT;
if (nT <= __kmp_hws_proc.offset ||
nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
// skip PU
KMP_CPU_CLR(idx, __kmp_affin_fullMask);
++n_old;
KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
continue; // move to next PU
}
++nTr;
if (pAddr) // collect requested thread's data
newAddr[n_new] = (*pAddr)[n_old];
++n_new;
++n_old;
hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
} // threads loop
if (nTr > 0) {
++nCr; // num cores per socket
++nCo; // total num cores
if (nTr > nTpC)
nTpC = nTr; // calc max threads per core
}
hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
} // cores loop
} // tiles support
} // numa_support
if (nCr > 0) { // found cores?
++nPkg; // num sockets
if (nCr > nCpP)
nCpP = nCr; // calc max cores per socket
}
} // sockets loop
// check the subset is valid
KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc);
KMP_DEBUG_ASSERT(nPkg > 0);
KMP_DEBUG_ASSERT(nCpP > 0);
KMP_DEBUG_ASSERT(nTpC > 0);
KMP_DEBUG_ASSERT(nCo > 0);
KMP_DEBUG_ASSERT(nPkg <= nPackages);
KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg);
KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore);
KMP_DEBUG_ASSERT(nCo <= __kmp_ncores);
nPackages = nPkg; // correct num sockets
nCoresPerPkg = nCpP; // correct num cores per socket
__kmp_nThreadsPerCore = nTpC; // correct num threads per core
__kmp_avail_proc = n_new; // correct num procs
__kmp_ncores = nCo; // correct num cores
// hwloc topology method end
} else
#endif // KMP_USE_HWLOC
{
// Non-hwloc method: relies on a uniform topology of at most 3 levels, so
// positions in the old array can be computed arithmetically.
// proc_num indexes __kmp_pu_os_idx in step with the procs being walked.
int n_old = 0, n_new = 0, proc_num = 0;
if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) {
KMP_WARNING(AffHWSubsetNoHWLOC);
goto _exit;
}
if (__kmp_hws_socket.num == 0)
__kmp_hws_socket.num = nPackages; // use all available sockets
if (__kmp_hws_core.num == 0)
__kmp_hws_core.num = nCoresPerPkg; // use all available cores
if (__kmp_hws_proc.num == 0 || __kmp_hws_proc.num > __kmp_nThreadsPerCore)
__kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts
if (!__kmp_affinity_uniform_topology()) {
KMP_WARNING(AffHWSubsetNonUniform);
goto _exit; // don't support non-uniform topology
}
if (depth > 3) {
KMP_WARNING(AffHWSubsetNonThreeLevel);
goto _exit; // don't support not-3-level topology
}
if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) {
KMP_WARNING(AffHWSubsetManySockets);
goto _exit;
}
if (__kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg) {
KMP_WARNING(AffHWSubsetManyCores);
goto _exit;
}
// Form the requested subset
if (pAddr) // pAddr is NULL in case of affinity_none
newAddr = (AddrUnsPair *)__kmp_allocate(
sizeof(AddrUnsPair) * __kmp_hws_socket.num * __kmp_hws_core.num *
__kmp_hws_proc.num);
for (int i = 0; i < nPackages; ++i) {
if (i < __kmp_hws_socket.offset ||
i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) {
// skip not-requested socket
n_old += nCoresPerPkg * __kmp_nThreadsPerCore;
if (__kmp_pu_os_idx != NULL) {
// walk through skipped socket
for (int j = 0; j < nCoresPerPkg; ++j) {
for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
++proc_num;
}
}
}
} else {
// walk through requested socket
for (int j = 0; j < nCoresPerPkg; ++j) {
if (j < __kmp_hws_core.offset ||
j >= __kmp_hws_core.offset +
__kmp_hws_core.num) { // skip not-requested core
n_old += __kmp_nThreadsPerCore;
if (__kmp_pu_os_idx != NULL) {
for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
++proc_num;
}
}
} else {
// walk through requested core
// NOTE(review): only __kmp_hws_proc.num is applied here; the thread
// offset (__kmp_hws_proc.offset) is not consulted on this path - confirm
// that is intended.
for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
if (k < __kmp_hws_proc.num) {
if (pAddr) // collect requested thread's data
newAddr[n_new] = (*pAddr)[n_old];
n_new++;
} else {
if (__kmp_pu_os_idx != NULL)
KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
}
n_old++;
++proc_num;
}
}
}
}
}
KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore);
KMP_DEBUG_ASSERT(n_new ==
__kmp_hws_socket.num * __kmp_hws_core.num *
__kmp_hws_proc.num);
nPackages = __kmp_hws_socket.num; // correct nPackages
nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg
__kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore
__kmp_avail_proc = n_new; // correct avail_proc
__kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores
} // non-hwloc topology method
if (pAddr) {
__kmp_free(*pAddr);
*pAddr = newAddr; // replace old topology with new one
}
// Report the resulting mask and topology counts when verbose.
if (__kmp_affinity_verbose) {
char m[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(m, KMP_AFFIN_MASK_PRINT_LEN,
__kmp_affin_fullMask);
if (__kmp_affinity_respect_mask) {
KMP_INFORM(InitOSProcSetRespect, "KMP_HW_SUBSET", m);
} else {
KMP_INFORM(InitOSProcSetNotRespect, "KMP_HW_SUBSET", m);
}
KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc);
kmp_str_buf_t buf;
__kmp_str_buf_init(&buf);
__kmp_str_buf_print(&buf, "%d", nPackages);
KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg,
__kmp_nThreadsPerCore, __kmp_ncores);
__kmp_str_buf_free(&buf);
}
// Common exit: the OS-index table is released on every path.
_exit:
if (__kmp_pu_os_idx != NULL) {
__kmp_free(__kmp_pu_os_idx);
__kmp_pu_os_idx = NULL;
}
}
// This function figures out the deepest level at which there is at least one
// cluster/core with more than one processing unit bound to it.
static int __kmp_affinity_find_core_level(const AddrUnsPair *address2os,
int nprocs, int bottom_level) {
int core_level = 0;
for (int i = 0; i < nprocs; i++) {
for (int j = bottom_level; j > 0; j--) {
if (address2os[i].first.labels[j] > 0) {
if (core_level < (j - 1)) {
core_level = j - 1;
}
}
}
}
return core_level;
}
// This function counts number of clusters/cores at given level.
static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os,
int nprocs, int bottom_level,
int core_level) {
int ncores = 0;
int i, j;
j = bottom_level;
for (i = 0; i < nprocs; i++) {
for (j = bottom_level; j > core_level; j--) {
if ((i + 1) < nprocs) {
if (address2os[i + 1].first.labels[j] > 0) {
break;
}
}
}
if (j == core_level) {
ncores++;
}
}
if (j > core_level) {
// In case of ( nprocs < __kmp_avail_proc ) we may end too deep and miss one
// core. May occur when called from __kmp_affinity_find_core().
ncores++;
}
return ncores;
}
// Find the cluster/core that processing unit 'proc' is bound to.
// The result is the number of cores counted over entries [0, proc] minus one,
// i.e. a zero-based core index.
static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc,
                                    int bottom_level, int core_level) {
  const int coresUpToProc = __kmp_affinity_compute_ncores(
      address2os, proc + 1, bottom_level, core_level);
  return coresUpToProc - 1;
}
// This function finds maximal number of processing units bound to a
// cluster/core at given level.
static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os,
int nprocs, int bottom_level,
int core_level) {
int maxprocpercore = 0;
if (core_level < bottom_level) {
for (int i = 0; i < nprocs; i++) {
int percore = address2os[i].first.labels[core_level + 1] + 1;
if (percore > maxprocpercore) {
maxprocpercore = percore;
}
}
} else {
maxprocpercore = 1;
}
return maxprocpercore;
}
// File-scope topology state.
// address2os is filled in by the __kmp_affinity_create_*_map() routines, which
// receive its address as an out-parameter.
static AddrUnsPair *address2os = NULL;
// NOTE(review): procarr and __kmp_aff_depth are not referenced in this part of
// the file; presumably a per-proc index array and the cached topology depth -
// confirm against their users.
static int *procarr = NULL;
static int __kmp_aff_depth = 0;
// KMP_EXIT_AFF_NONE: common bail-out used when a topology method reports that
// affinity is "none". It asserts that the affinity type is affinity_none and
// that no address2os table exists, applies the hardware-subset limits without
// a topology array, creates the single "whole mask" place and then returns
// from the ENCLOSING function. Because it expands to several statements ending
// in a bare `return;`, it may only be used as a braced statement inside a
// function returning void. With KMP_USE_HIER_SCHED it additionally sets the
// dispatch hierarchy values before returning.
#if KMP_USE_HIER_SCHED
#define KMP_EXIT_AFF_NONE \
KMP_ASSERT(__kmp_affinity_type == affinity_none); \
KMP_ASSERT(address2os == NULL); \
__kmp_apply_thread_places(NULL, 0); \
__kmp_create_affinity_none_places(); \
__kmp_dispatch_set_hierarchy_values(); \
return;
#else
#define KMP_EXIT_AFF_NONE \
KMP_ASSERT(__kmp_affinity_type == affinity_none); \
KMP_ASSERT(address2os == NULL); \
__kmp_apply_thread_places(NULL, 0); \
__kmp_create_affinity_none_places(); \
return;
#endif
// Build the set of places for affinity_none: a mask array with exactly one
// element, which is a copy of __kmp_affin_fullMask.
// Requires the full mask to exist and the affinity type to be affinity_none
// (both asserted).
static void __kmp_create_affinity_none_places() {
  KMP_ASSERT(__kmp_affin_fullMask != NULL);
  KMP_ASSERT(__kmp_affinity_type == affinity_none);

  // One place only.
  __kmp_affinity_num_masks = 1;
  KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);

  // That place is the whole full mask.
  KMP_CPU_COPY(KMP_CPU_INDEX(__kmp_affinity_masks, 0), __kmp_affin_fullMask);
}
// Three-way comparator over two AddrUnsPair elements (passed as const void *),
// ordering them by the childNums of their Address.
// The key is permuted by __kmp_affinity_compact: the innermost
// __kmp_affinity_compact levels are compared first, deepest level first, and
// then the remaining outer levels from the top down.
// Returns -1, 0 or 1.
static int __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) {
  const Address *aa = &(((const AddrUnsPair *)a)->first);
  const Address *bb = &(((const AddrUnsPair *)b)->first);
  unsigned depth = aa->depth;
  KMP_DEBUG_ASSERT(depth == bb->depth);
  KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
  KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);

  unsigned pos = 0;

  // Most significant part of the key: innermost levels, deepest first.
  while (pos < (unsigned)__kmp_affinity_compact) {
    int j = depth - pos - 1;
    if (aa->childNums[j] != bb->childNums[j]) {
      return (aa->childNums[j] < bb->childNums[j]) ? -1 : 1;
    }
    pos++;
  }

  // Remainder of the key: outer levels in natural order.
  while (pos < depth) {
    int j = pos - __kmp_affinity_compact;
    if (aa->childNums[j] != bb->childNums[j]) {
      return (aa->childNums[j] < bb->childNums[j]) ? -1 : 1;
    }
    pos++;
  }

  return 0;
}
static void __kmp_aux_affinity_initialize(void) {
if (__kmp_affinity_masks != NULL) {
KMP_ASSERT(__kmp_affin_fullMask != NULL);
return;
}
// Create the "full" mask - this defines all of the processors that we
// consider to be in the machine model. If respect is set, then it is the
// initialization thread's affinity mask. Otherwise, it is all processors that
// we know about on the machine.
if (__kmp_affin_fullMask == NULL) {
KMP_CPU_ALLOC(__kmp_affin_fullMask);
}
if (KMP_AFFINITY_CAPABLE()) {
if (__kmp_affinity_respect_mask) {
__kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
// Count the number of available processors.
unsigned i;
__kmp_avail_proc = 0;
KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
continue;
}
__kmp_avail_proc++;
}
if (__kmp_avail_proc > __kmp_xproc) {
if (__kmp_affinity_verbose ||
(__kmp_affinity_warnings &&
(__kmp_affinity_type != affinity_none))) {
KMP_WARNING(ErrorInitializeAffinity);
}
__kmp_affinity_type = affinity_none;
KMP_AFFINITY_DISABLE();
return;
}
} else {
__kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
__kmp_avail_proc = __kmp_xproc;
}
}
if (__kmp_affinity_gran == affinity_gran_tile &&
// check if user's request is valid
__kmp_affinity_dispatch->get_api_type() == KMPAffinity::NATIVE_OS) {
KMP_WARNING(AffTilesNoHWLOC, "KMP_AFFINITY");
__kmp_affinity_gran = affinity_gran_package;
}
int depth = -1;
kmp_i18n_id_t msg_id = kmp_i18n_null;
// For backward compatibility, setting KMP_CPUINFO_FILE =>
// KMP_TOPOLOGY_METHOD=cpuinfo
if ((__kmp_cpuinfo_file != NULL) &&
(__kmp_affinity_top_method == affinity_top_method_all)) {
__kmp_affinity_top_method = affinity_top_method_cpuinfo;
}
if (__kmp_affinity_top_method == affinity_top_method_all) {
// In the default code path, errors are not fatal - we just try using
// another method. We only emit a warning message if affinity is on, or the
// verbose flag is set, and the nowarnings flag was not set.
const char *file_name = NULL;
int line = 0;
#if KMP_USE_HWLOC
if (depth < 0 &&
__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
if (__kmp_affinity_verbose) {
KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
}
if (!__kmp_hwloc_error) {
depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
if (depth == 0) {
KMP_EXIT_AFF_NONE;
} else if (depth < 0 && __kmp_affinity_verbose) {
KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
}
} else if (__kmp_affinity_verbose) {
KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
}
}
#endif
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
if (depth < 0) {
if (__kmp_affinity_verbose) {
KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
}
file_name = NULL;
depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
if (depth == 0) {
KMP_EXIT_AFF_NONE;
}
if (depth < 0) {
if (__kmp_affinity_verbose) {
if (msg_id != kmp_i18n_null) {
KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY",
__kmp_i18n_catgets(msg_id),
KMP_I18N_STR(DecodingLegacyAPIC));
} else {
KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
KMP_I18N_STR(DecodingLegacyAPIC));
}
}
file_name = NULL;
depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
if (depth == 0) {
KMP_EXIT_AFF_NONE;
}
}
}
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
#if KMP_OS_LINUX
if (depth < 0) {
if (__kmp_affinity_verbose) {
if (msg_id != kmp_i18n_null) {
KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY",
__kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
} else {
KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
}
}
FILE *f = fopen("/proc/cpuinfo", "r");
if (f == NULL) {
msg_id = kmp_i18n_str_CantOpenCpuinfo;
} else {
file_name = "/proc/cpuinfo";
depth =
__kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
fclose(f);
if (depth == 0) {
KMP_EXIT_AFF_NONE;
}
}
}
#endif /* KMP_OS_LINUX */
#if KMP_GROUP_AFFINITY
if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
if (__kmp_affinity_verbose) {
KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
}
depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
KMP_ASSERT(depth != 0);
}
#endif /* KMP_GROUP_AFFINITY */
if (depth < 0) {
if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
if (file_name == NULL) {
KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
} else if (line == 0) {
KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
} else {
KMP_INFORM(UsingFlatOSFileLine, file_name, line,
__kmp_i18n_catgets(msg_id));
}
}
// FIXME - print msg if msg_id = kmp_i18n_null ???
file_name = "";
depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
if (depth == 0) {
KMP_EXIT_AFF_NONE;
}
KMP_ASSERT(depth > 0);
KMP_ASSERT(address2os != NULL);
}
}
#if KMP_USE_HWLOC
else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
if (__kmp_affinity_verbose) {
KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
}
depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
if (depth == 0) {
KMP_EXIT_AFF_NONE;
}
}
#endif // KMP_USE_HWLOC
// If the user has specified that a particular topology discovery method is to be
// used, then we abort if that method fails. The exception is group affinity,
// which might have been implicitly set.
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
if (__kmp_affinity_verbose) {
KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
}
depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
if (depth == 0) {
KMP_EXIT_AFF_NONE;
}
if (depth < 0) {
KMP_ASSERT(msg_id != kmp_i18n_null);
KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
}
} else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
if (__kmp_affinity_verbose) {
KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
}
depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
if (depth == 0) {
KMP_EXIT_AFF_NONE;
}
if (depth < 0) {
KMP_ASSERT(msg_id != kmp_i18n_null);
KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
}
}
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
const char *filename;
if (__kmp_cpuinfo_file != NULL) {
filename = __kmp_cpuinfo_file;
} else {
filename = "/proc/cpuinfo";
}
if (__kmp_affinity_verbose) {
KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
}
FILE *f = fopen(filename, "r");
if (f == NULL) {
int code = errno;
if (__kmp_cpuinfo_file != NULL) {
__kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code),
KMP_HNT(NameComesFrom_CPUINFO_FILE), __kmp_msg_null);
} else {
__kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code),
__kmp_msg_null);
}
}
int line = 0;
depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
fclose(f);
if (depth < 0) {
KMP_ASSERT(msg_id != kmp_i18n_null);
if (line > 0) {
KMP_FATAL(FileLineMsgExiting, filename, line,
__kmp_i18n_catgets(msg_id));
} else {
KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
}
}
if (__kmp_affinity_type == affinity_none) {
KMP_ASSERT(depth == 0);
KMP_EXIT_AFF_NONE;
}
}
#if KMP_GROUP_AFFINITY
else if (__kmp_affinity_top_method == affinity_top_method_group) {
if (__kmp_affinity_verbose) {
KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
}
depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
KMP_ASSERT(depth != 0);
if (depth < 0) {
KMP_ASSERT(msg_id != kmp_i18n_null);
KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
}
}
#endif /* KMP_GROUP_AFFINITY */
else if (__kmp_affinity_top_method == affinity_top_method_flat) {
if (__kmp_affinity_verbose) {
KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
}
depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
if (depth == 0) {
KMP_EXIT_AFF_NONE;
}
// should not fail
KMP_ASSERT(depth > 0);
KMP_ASSERT(address2os != NULL);
}
#if KMP_USE_HIER_SCHED
__kmp_dispatch_set_hierarchy_values();
#endif
if (address2os == NULL) {
if (KMP_AFFINITY_CAPABLE() &&
(__kmp_affinity_verbose ||
(__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) {
KMP_WARNING(ErrorInitializeAffinity);
}
__kmp_affinity_type = affinity_none;
__kmp_create_affinity_none_places();
KMP_AFFINITY_DISABLE();
return;
}
if (__kmp_affinity_gran == affinity_gran_tile
#if KMP_USE_HWLOC
&& __kmp_tile_depth == 0
#endif
) {
// tiles requested but not detected, warn user on this
KMP_WARNING(AffTilesNoTiles, "KMP_AFFINITY");
}
__kmp_apply_thread_places(&address2os, depth);
// Create the table of masks, indexed by thread Id.
unsigned maxIndex;
unsigned numUnique;
kmp_affin_mask_t *osId2Mask =
__kmp_create_masks(&maxIndex, &numUnique, address2os, __kmp_avail_proc);
if (__kmp_affinity_gran_levels == 0) {
KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
}
// Set the childNums vector in all Address objects. This must be done before
// we can sort using __kmp_affinity_cmp_Address_child_num(), which takes into
// account the setting of __kmp_affinity_compact.
__kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
switch (__kmp_affinity_type) {
case affinity_explicit:
KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
#if OMP_40_ENABLED
if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
#endif
{
__kmp_affinity_process_proclist(
&__kmp_affinity_masks, &__kmp_affinity_num_masks,
__kmp_affinity_proclist, osId2Mask, maxIndex);
}
#if OMP_40_ENABLED
else {
__kmp_affinity_process_placelist(
&__kmp_affinity_masks, &__kmp_affinity_num_masks,
__kmp_affinity_proclist, osId2Mask, maxIndex);
}
#endif
if (__kmp_affinity_num_masks == 0) {
if (__kmp_affinity_verbose ||
(__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
KMP_WARNING(AffNoValidProcID);
}
__kmp_affinity_type = affinity_none;
__kmp_create_affinity_none_places();
return;
}
break;
// The other affinity types rely on sorting the Addresses according to some
// permutation of the machine topology tree. Set __kmp_affinity_compact and
// __kmp_affinity_offset appropriately, then jump to a common code fragment
// to do the sort and create the array of affinity masks.
case affinity_logical:
__kmp_affinity_compact = 0;
if (__kmp_affinity_offset) {
__kmp_affinity_offset =
__kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
}
goto sortAddresses;
case affinity_physical:
if (__kmp_nThreadsPerCore > 1) {
__kmp_affinity_compact = 1;
if (__kmp_affinity_compact >= depth) {
__kmp_affinity_compact = 0;
}
} else {
__kmp_affinity_compact = 0;
}
if (__kmp_affinity_offset) {
__kmp_affinity_offset =
__kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
}
goto sortAddresses;
case affinity_scatter:
if (__kmp_affinity_compact >= depth) {
__kmp_affinity_compact = 0;
} else {
__kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
}
goto sortAddresses;
case affinity_compact:
if (__kmp_affinity_compact >= depth) {
__kmp_affinity_compact = depth - 1;
}
goto sortAddresses;
case affinity_balanced:
if (depth <= 1) {
if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
}
__kmp_affinity_type = affinity_none;
__kmp_create_affinity_none_places();
return;
} else if (!__kmp_affinity_uniform_topology()) {
// Save the depth for further usage
__kmp_aff_depth = depth;
int core_level = __kmp_affinity_find_core_level(
address2os, __kmp_avail_proc, depth - 1);
int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc,
depth - 1, core_level);
int maxprocpercore = __kmp_affinity_max_proc_per_core(
address2os, __kmp_avail_proc, depth - 1, core_level);
int nproc = ncores * maxprocpercore;
if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
}
__kmp_affinity_type = affinity_none;
return;
}
procarr = (int *)__kmp_allocate(sizeof(int) * nproc);
for (int i = 0; i < nproc; i++) {
procarr[i] = -1;
}
int lastcore = -1;
int inlastcore = 0;
for (int i = 0; i < __kmp_avail_proc; i++) {
int proc = address2os[i].second;
int core =
__kmp_affinity_find_core(address2os, i, depth - 1, core_level);
if (core == lastcore) {
inlastcore++;
} else {
inlastcore = 0;
}
lastcore = core;
procarr[core * maxprocpercore + inlastcore] = proc;
}
}
if (__kmp_affinity_compact >= depth) {
__kmp_affinity_compact = depth - 1;
}
sortAddresses:
// Allocate the gtid->affinity mask table.
if (__kmp_affinity_dups) {
__kmp_affinity_num_masks = __kmp_avail_proc;
} else {
__kmp_affinity_num_masks = numUnique;
}
#if OMP_40_ENABLED
if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
(__kmp_affinity_num_places > 0) &&
((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) {
__kmp_affinity_num_masks = __kmp_affinity_num_places;
}
#endif
KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
// Sort the address2os table according to the current setting of
// __kmp_affinity_compact, then fill out __kmp_affinity_masks.
qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
__kmp_affinity_cmp_Address_child_num);
{
int i;
unsigned j;
for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
if ((!__kmp_affinity_dups) && (!address2os[i].first.leader)) {
continue;
}
unsigned osId = address2os[i].second;
kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j);
KMP_ASSERT(KMP_CPU_ISSET(osId, src));
KMP_CPU_COPY(dest, src);
if (++j >= __kmp_affinity_num_masks) {
break;
}
}
KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
}
break;
default:
KMP_ASSERT2(0, "Unexpected affinity setting");
}
KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1);
machine_hierarchy.init(address2os, __kmp_avail_proc);
}
#undef KMP_EXIT_AFF_NONE
// Public entry point for affinity initialization.
//
// The real initialization routine tests __kmp_affinity_type against
// affinity_none in many places and predates the separate affinity_disabled
// value. Rather than touching every such test, a disabled configuration is
// temporarily presented as affinity_none for the duration of the call and
// switched back to affinity_disabled afterwards.
void __kmp_affinity_initialize(void) {
  const int disabled = (__kmp_affinity_type == affinity_disabled);

  // A machine that is not affinity capable must have been marked disabled.
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(disabled);
  }

  if (!disabled) {
    __kmp_aux_affinity_initialize();
    return;
  }

  __kmp_affinity_type = affinity_none;
  __kmp_aux_affinity_initialize();
  __kmp_affinity_type = affinity_disabled;
}
// Release everything the affinity subsystem allocated and reset its settings
// to their defaults. Every pointer is cleared after it is freed. The affinity
// API object is destroyed last.
void __kmp_affinity_uninitialize(void) {
  // Place mask table; the count is needed by the free, so reset it afterwards.
  if (__kmp_affinity_masks) {
    KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
    __kmp_affinity_masks = NULL;
  }
  __kmp_affinity_num_masks = 0;

  // Mask describing all processors available to the runtime.
  if (__kmp_affin_fullMask) {
    KMP_CPU_FREE(__kmp_affin_fullMask);
    __kmp_affin_fullMask = NULL;
  }

  // User-supplied proc list / place list string.
  if (__kmp_affinity_proclist) {
    __kmp_free(__kmp_affinity_proclist);
    __kmp_affinity_proclist = NULL;
  }

  // Topology table and the per-core processor array used by balanced affinity.
  if (address2os) {
    __kmp_free(address2os);
    address2os = NULL;
  }
  if (procarr) {
    __kmp_free(procarr);
    procarr = NULL;
  }

  // Scalar settings go back to their defaults.
  __kmp_affinity_type = affinity_default;
#if OMP_40_ENABLED
  __kmp_affinity_num_places = 0;
#endif

#if KMP_USE_HWLOC
  if (__kmp_hwloc_topology) {
    hwloc_topology_destroy(__kmp_hwloc_topology);
    __kmp_hwloc_topology = NULL;
  }
#endif

  KMPAffinity::destroy_api();
}
// Choose the initial affinity mask for thread `gtid`, record it (and, with
// OMP_40_ENABLED, the matching place information) in the thread's kmp_info_t,
// and bind the thread to that mask.
//
// gtid:     global thread id; indexes __kmp_threads and, when a mask table is
//           in use, selects the entry (gtid + __kmp_affinity_offset) modulo
//           __kmp_affinity_num_masks.
// isa_root: non-zero for a root thread. With OMP_40_ENABLED a root thread gets
//           th_new_place set and its place partition set to the whole list;
//           in the proc-bind branch a non-root thread starts on the full mask.
//
// Returns without binding when the runtime is not affinity capable, or (with
// KMP_GROUP_AFFINITY) when the full mask would be used and more than one
// processor group exists; in the latter case th_affin_mask has already been
// allocated or zeroed.
void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
if (!KMP_AFFINITY_CAPABLE()) {
return;
}
kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
// Make sure the thread owns a mask object; reuse (cleared) if one exists.
if (th->th.th_affin_mask == NULL) {
KMP_CPU_ALLOC(th->th.th_affin_mask);
} else {
KMP_CPU_ZERO(th->th.th_affin_mask);
}
// Copy the thread mask to the kmp_info_t structure. If
// __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that
// has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set,
// then the full mask is the same as the mask of the initialization thread.
kmp_affin_mask_t *mask;
int i;
#if OMP_40_ENABLED
if (KMP_AFFINITY_NON_PROC_BIND)
#endif
{
// KMP_AFFINITY-style selection: none/balanced start on the full mask (place
// index 0); every other type picks an entry from the mask table.
if ((__kmp_affinity_type == affinity_none) ||
(__kmp_affinity_type == affinity_balanced)) {
#if KMP_GROUP_AFFINITY
if (__kmp_num_proc_groups > 1) {
return;
}
#endif
KMP_ASSERT(__kmp_affin_fullMask != NULL);
i = 0;
mask = __kmp_affin_fullMask;
} else {
KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
}
}
#if OMP_40_ENABLED
else {
// Proc-bind selection: non-root threads, and any thread when binding is
// proc_bind_false, start on the full mask and are marked KMP_PLACE_ALL.
if ((!isa_root) ||
(__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
#if KMP_GROUP_AFFINITY
if (__kmp_num_proc_groups > 1) {
return;
}
#endif
KMP_ASSERT(__kmp_affin_fullMask != NULL);
i = KMP_PLACE_ALL;
mask = __kmp_affin_fullMask;
} else {
// int i = some hash function or just a counter that doesn't
// always start at 0. Use gtid for now.
KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
}
}
#endif
#if OMP_40_ENABLED
// Record the chosen place and, where applicable, the place partition.
th->th.th_current_place = i;
if (isa_root) {
th->th.th_new_place = i;
th->th.th_first_place = 0;
th->th.th_last_place = __kmp_affinity_num_masks - 1;
} else if (KMP_AFFINITY_NON_PROC_BIND) {
// When using a Non-OMP_PROC_BIND affinity method,
// set all threads' place-partition-var to the entire place list
th->th.th_first_place = 0;
th->th.th_last_place = __kmp_affinity_num_masks - 1;
}
if (i == KMP_PLACE_ALL) {
KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
gtid));
} else {
KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
gtid, i));
}
#else
if (i == -1) {
KA_TRACE(
100,
("__kmp_affinity_set_init_mask: binding T#%d to __kmp_affin_fullMask\n",
gtid));
} else {
KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
gtid, i));
}
#endif /* OMP_40_ENABLED */
KMP_CPU_COPY(th->th.th_affin_mask, mask);
if (__kmp_affinity_verbose
/* to avoid duplicate printing (will be correctly printed on barrier) */
&& (__kmp_affinity_type == affinity_none ||
(i != KMP_PLACE_ALL && __kmp_affinity_type != affinity_balanced))) {
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
th->th.th_affin_mask);
KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
__kmp_gettid(), gtid, buf);
}
#if KMP_OS_WINDOWS
// On Windows* OS, the process affinity mask might have changed. If the user
// didn't request affinity and this call fails, just continue silently.
// See CQ171393.
// NOTE(review): the FALSE/TRUE second argument presumably selects whether a
// failed bind is treated as fatal - confirm against __kmp_set_system_affinity.
if (__kmp_affinity_type == affinity_none) {
__kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
} else
#endif
__kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}
#if OMP_40_ENABLED
// Bind thread `gtid` to the place recorded in its th_new_place field.
//
// gtid: global thread id; indexes __kmp_threads.
//
// Does nothing when the runtime is not affinity capable. Otherwise the place's
// mask is copied from __kmp_affinity_masks into the thread's th_affin_mask,
// th_current_place is updated to th_new_place, and the thread is bound to the
// mask. Before that, th_new_place is asserted to be a valid index into the
// mask table and to lie inside the thread's place partition
// [th_first_place, th_last_place].
void __kmp_affinity_set_place(int gtid) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return;
  }

  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);

  KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current "
                 "place = %d)\n",
                 gtid, th->th.th_new_place, th->th.th_current_place));

  // Check that the new place is within this thread's partition.
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
  KMP_ASSERT(th->th.th_new_place >= 0);
  // th_new_place indexes __kmp_affinity_masks below, so it must be strictly
  // less than the number of masks.
  KMP_ASSERT((unsigned)th->th.th_new_place < __kmp_affinity_num_masks);
  if (th->th.th_first_place <= th->th.th_last_place) {
    KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
               (th->th.th_new_place <= th->th.th_last_place));
  } else {
    // The partition wraps around the end of the place list: it covers
    // [th_first_place, last index] followed by [0, th_last_place].
    KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) ||
               (th->th.th_new_place <= th->th.th_last_place));
  }

  // Copy the thread mask to the kmp_info_t structure,
  // and set this thread's affinity.
  kmp_affin_mask_t *mask =
      KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place);
  KMP_CPU_COPY(th->th.th_affin_mask, mask);
  th->th.th_current_place = th->th.th_new_place;

  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
               __kmp_gettid(), gtid, buf);
  }
  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}
#endif /* OMP_40_ENABLED */
// Bind the calling thread to a caller-supplied affinity mask.
//
// mask: address of an opaque mask handle; *mask is used as a
//       kmp_affin_mask_t *.
//
// Returns -1 when the runtime is not affinity capable; otherwise the result of
// __kmp_set_system_affinity(). The thread's stored th_affin_mask is updated
// only when that result is 0.
//
// When __kmp_env_consistency_check is set, the mask is validated first and a
// fatal AffinityInvalidMask error is raised if the handle is NULL, if the mask
// selects a processor that is not in __kmp_affin_fullMask, if it selects no
// processor at all, or (KMP_GROUP_AFFINITY) if __kmp_get_proc_group() rejects
// it.
//
// With OMP_40_ENABLED the thread's place bookkeeping is reset and proc_bind is
// switched to proc_bind_false for the current task, whatever the result of the
// bind.
int __kmp_aux_set_affinity(void **mask) {
  int gtid;
  kmp_info_t *th;
  int retval;

  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  gtid = __kmp_entry_gtid();
  KA_TRACE(1000, (""); {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              (kmp_affin_mask_t *)(*mask));
    __kmp_debug_printf(
        "kmp_set_affinity: setting affinity mask for thread %d = %s\n", gtid,
        buf);
  });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
    } else {
      unsigned proc;
      int num_procs = 0;

      KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
        // The iteration can visit indices that are not set in the mask, so
        // filter those out first. Only processors the caller actually asked
        // for have to be present in the full mask.
        if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
          continue;
        }
        if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
          KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        num_procs++;
      }
      if (num_procs == 0) {
        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
      }

#if KMP_GROUP_AFFINITY
      if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
      }
#endif /* KMP_GROUP_AFFINITY */
    }
  }

  th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
  if (retval == 0) {
    // Remember the new binding only if the OS accepted it.
    KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
  }

#if OMP_40_ENABLED
  th->th.th_current_place = KMP_PLACE_UNDEFINED;
  th->th.th_new_place = KMP_PLACE_UNDEFINED;
  th->th.th_first_place = 0;
  th->th.th_last_place = __kmp_affinity_num_masks - 1;

  // Turn off 4.0 affinity for the current thread at this parallel level.
  th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
#endif

  return retval;
}
int __kmp_aux_get_affinity(void **mask) {
int gtid;
int retval;
kmp_info_t *th;
if (!KMP_AFFINITY_CAPABLE()) {
return -1;
}
gtid = __kmp_entry_gtid();
th = __kmp_threads[gtid];
KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
KA_TRACE(1000, (""); {
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
th->th.th_affin_mask);
__kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n",
gtid, buf);
});
if (__kmp_env_consistency_check) {
if ((mask == NULL) || (*mask == NULL)) {
KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
}
}
#if !KMP_OS_WINDOWS
retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
KA_TRACE(1000, (""); {
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
(kmp_affin_mask_t *)(*mask));
__kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n",
gtid, buf);
});
return retval;
#else
KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
return 0;
#endif /* KMP_OS_WINDOWS */
}
// Upper bound (exclusive) on processor numbers accepted by the mask API.
//
// Returns 0 when the runtime is not affinity capable. With KMP_GROUP_AFFINITY
// and more than one processor group, returns the number of bits in one
// DWORD_PTR times the number of groups. Otherwise returns __kmp_xproc.
int __kmp_aux_get_affinity_max_proc() {
  int max_proc = 0;
  if (KMP_AFFINITY_CAPABLE()) {
    max_proc = __kmp_xproc;
#if KMP_GROUP_AFFINITY
    if (__kmp_num_proc_groups > 1) {
      max_proc = (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT);
    }
#endif
  }
  return max_proc;
}
int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
if (!KMP_AFFINITY_CAPABLE()) {
return -1;
}
KA_TRACE(1000, (""); {
int gtid = __kmp_entry_gtid();
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
(kmp_affin_mask_t *)(*mask));
__kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
"affinity mask for thread %d = %s\n",
proc, gtid, buf);
});
if (__kmp_env_consistency_check) {
if ((mask == NULL) || (*mask == NULL)) {
KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
}
}
if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
return -1;
}
if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
return -2;
}
KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
return 0;
}
int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
if (!KMP_AFFINITY_CAPABLE()) {
return -1;
}
KA_TRACE(1000, (""); {
int gtid = __kmp_entry_gtid();
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
(kmp_affin_mask_t *)(*mask));
__kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
"affinity mask for thread %d = %s\n",
proc, gtid, buf);
});
if (__kmp_env_consistency_check) {
if ((mask == NULL) || (*mask == NULL)) {
KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
}
}
if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
return -1;
}
if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
return -2;
}
KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
return 0;
}
int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
if (!KMP_AFFINITY_CAPABLE()) {
return -1;
}
KA_TRACE(1000, (""); {
int gtid = __kmp_entry_gtid();
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
(kmp_affin_mask_t *)(*mask));
__kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
"affinity mask for thread %d = %s\n",
proc, gtid, buf);
});
if (__kmp_env_consistency_check) {
if ((mask == NULL) || (*mask == NULL)) {
KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
}
}
if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
return -1;
}
if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
return 0;
}
return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}
// Dynamic affinity settings - Affinity balanced
// Compute the "balanced" affinity mask for one thread and bind the thread to
// it.
//
// th:       thread descriptor. Its th_affin_mask is cleared and rebuilt here,
//           and its ds_tid gives the thread's position among the nthreads
//           being distributed.
// nthreads: number of threads being distributed.
//
// The function first decides whether a thread is bound to a single OS proc
// (fine_gran) or to every OS proc of the chosen core, based on
// __kmp_affinity_gran. It then takes one of two paths:
//  - uniform topology: the core and the slot within the core are computed
//    arithmetically from tid and looked up in address2os;
//  - non-uniform topology: the procarr table (one row of nth_per_core slots
//    per core, -1 marking an empty slot) is used to spread threads over the
//    processors that actually exist.
// Both paths end by calling __kmp_set_system_affinity(mask, TRUE).
void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
KMP_DEBUG_ASSERT(th);
bool fine_gran = true;
int tid = th->th.th_info.ds.ds_tid;
// Coarser granularities switch to whole-core binding, but only when the
// coarser unit really contains more than one element.
switch (__kmp_affinity_gran) {
case affinity_gran_fine:
case affinity_gran_thread:
break;
case affinity_gran_core:
if (__kmp_nThreadsPerCore > 1) {
fine_gran = false;
}
break;
case affinity_gran_package:
if (nCoresPerPkg > 1) {
fine_gran = false;
}
break;
default:
fine_gran = false;
}
if (__kmp_affinity_uniform_topology()) {
int coreID;
int threadID;
// Number of hyper threads per core in HT machine
int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
// Number of cores
int ncores = __kmp_ncores;
// With one thread per core on a multi-package machine, packages take the
// role of "cores" in the arithmetic below.
if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) {
__kmp_nth_per_core = __kmp_avail_proc / nPackages;
ncores = nPackages;
}
// How many threads will be bound to each core
int chunk = nthreads / ncores;
// How many cores will have an additional thread bound to it - "big cores"
int big_cores = nthreads % ncores;
// Number of threads on the big cores
int big_nth = (chunk + 1) * big_cores;
// The first big_nth thread ids land on big cores (chunk + 1 threads each);
// the remaining ids are spread chunk per core.
if (tid < big_nth) {
coreID = tid / (chunk + 1);
threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
} else { // tid >= big_nth
coreID = (tid - big_cores) / chunk;
threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
}
KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
"Illegal set affinity operation when not capable");
kmp_affin_mask_t *mask = th->th.th_affin_mask;
KMP_CPU_ZERO(mask);
if (fine_gran) {
// Bind to the single selected slot of the selected core.
int osID = address2os[coreID * __kmp_nth_per_core + threadID].second;
KMP_CPU_SET(osID, mask);
} else {
// Bind to every slot of the selected core.
for (int i = 0; i < __kmp_nth_per_core; i++) {
int osID;
osID = address2os[coreID * __kmp_nth_per_core + i].second;
KMP_CPU_SET(osID, mask);
}
}
if (__kmp_affinity_verbose) {
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
__kmp_gettid(), tid, buf);
}
__kmp_set_system_affinity(mask, TRUE);
} else { // Non-uniform topology
kmp_affin_mask_t *mask = th->th.th_affin_mask;
KMP_CPU_ZERO(mask);
// Recompute the core layout from address2os using the depth saved in
// __kmp_aff_depth.
int core_level = __kmp_affinity_find_core_level(
address2os, __kmp_avail_proc, __kmp_aff_depth - 1);
int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc,
__kmp_aff_depth - 1, core_level);
int nth_per_core = __kmp_affinity_max_proc_per_core(
address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
// For performance gain consider the special case nthreads ==
// __kmp_avail_proc
if (nthreads == __kmp_avail_proc) {
if (fine_gran) {
// One thread per available proc: thread tid takes entry tid.
int osID = address2os[tid].second;
KMP_CPU_SET(osID, mask);
} else {
// Bind to every proc that shares a core with entry tid.
int core = __kmp_affinity_find_core(address2os, tid,
__kmp_aff_depth - 1, core_level);
for (int i = 0; i < __kmp_avail_proc; i++) {
int osID = address2os[i].second;
if (__kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1,
core_level) == core) {
KMP_CPU_SET(osID, mask);
}
}
}
} else if (nthreads <= ncores) {
// At most one thread per core: thread tid takes the tid-th core that has
// at least one processor in procarr. `core` counts such cores seen so far.
int core = 0;
for (int i = 0; i < ncores; i++) {
// Check if this core from procarr[] is in the mask
int in_mask = 0;
for (int j = 0; j < nth_per_core; j++) {
if (procarr[i * nth_per_core + j] != -1) {
in_mask = 1;
break;
}
}
if (in_mask) {
if (tid == core) {
for (int j = 0; j < nth_per_core; j++) {
int osID = procarr[i * nth_per_core + j];
if (osID != -1) {
KMP_CPU_SET(osID, mask);
// For fine granularity it is enough to set the first available
// osID for this core
if (fine_gran) {
break;
}
}
}
break;
} else {
core++;
}
}
}
} else { // nthreads > ncores
// Array to save the number of processors at each core
int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
// Array to save the number of cores with "x" available processors;
int *ncores_with_x_procs =
(int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
// Array to save the number of cores with # procs from x to nth_per_core
int *ncores_with_x_to_max_procs =
(int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
for (int i = 0; i <= nth_per_core; i++) {
ncores_with_x_procs[i] = 0;
ncores_with_x_to_max_procs[i] = 0;
}
// Count the populated slots of every core and build the histogram.
for (int i = 0; i < ncores; i++) {
int cnt = 0;
for (int j = 0; j < nth_per_core; j++) {
if (procarr[i * nth_per_core + j] != -1) {
cnt++;
}
}
nproc_at_core[i] = cnt;
ncores_with_x_procs[cnt]++;
}
// Turn the histogram into "cores with at least i processors".
for (int i = 0; i <= nth_per_core; i++) {
for (int j = i; j <= nth_per_core; j++) {
ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
}
}
// Max number of processors
int nproc = nth_per_core * ncores;
// An array to keep number of threads per each context
int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
for (int i = 0; i < nproc; i++) {
newarr[i] = 0;
}
// Hand out the nthreads threads to contexts. While flag == 0 a context
// receives a thread only if it has none yet; after the first full sweep
// flag becomes 1 and already-used contexts may receive additional threads.
int nth = nthreads;
int flag = 0;
while (nth > 0) {
for (int j = 1; j <= nth_per_core; j++) {
int cnt = ncores_with_x_to_max_procs[j];
for (int i = 0; i < ncores; i++) {
// Skip the core with 0 processors
if (nproc_at_core[i] == 0) {
continue;
}
for (int k = 0; k < nth_per_core; k++) {
if (procarr[i * nth_per_core + k] != -1) {
if (newarr[i * nth_per_core + k] == 0) {
newarr[i * nth_per_core + k] = 1;
cnt--;
nth--;
break;
} else {
if (flag != 0) {
newarr[i * nth_per_core + k]++;
cnt--;
nth--;
break;
}
}
}
}
if (cnt == 0 || nth == 0) {
break;
}
}
if (nth == 0) {
break;
}
}
flag = 1;
}
// Walk the contexts accumulating their thread counts; the first context
// whose running total exceeds tid is the one this thread belongs to.
int sum = 0;
for (int i = 0; i < nproc; i++) {
sum += newarr[i];
if (sum > tid) {
if (fine_gran) {
int osID = procarr[i];
KMP_CPU_SET(osID, mask);
} else {
int coreID = i / nth_per_core;
for (int ii = 0; ii < nth_per_core; ii++) {
int osID = procarr[coreID * nth_per_core + ii];
if (osID != -1) {
KMP_CPU_SET(osID, mask);
}
}
}
break;
}
}
__kmp_free(newarr);
}
if (__kmp_affinity_verbose) {
char buf[KMP_AFFIN_MASK_PRINT_LEN];
__kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
__kmp_gettid(), tid, buf);
}
__kmp_set_system_affinity(mask, TRUE);
}
}
#if KMP_OS_LINUX
// We don't need this entry for Windows because
// there is GetProcessAffinityMask() api
//
// The intended usage is indicated by these steps:
// 1) The user gets the current affinity mask
// 2) Then sets the affinity by calling this function
// 3) Error check the return value
// 4) Use non-OpenMP parallelization
// 5) Reset the affinity to what was stored in step 1)
// Bind the calling OpenMP thread to __kmp_affin_fullMask.
//
// Returns 0 on success, -1 if the thread cannot be bound (it is not an OpenMP
// thread, or affinity is not initialized), and >0 (errno) if an error happened
// during binding.
#ifdef __cplusplus
extern "C"
#endif
int kmp_set_thread_affinity_mask_initial() {
  const int gtid = __kmp_get_gtid();

  // Do not touch non-omp threads
  if (gtid < 0) {
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "non-omp thread, returning\n"));
    return -1;
  }

  // Both affinity support and middle initialization are required.
  if (!(KMP_AFFINITY_CAPABLE() && __kmp_init_middle)) {
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "affinity not initialized, returning\n"));
    return -1;
  }

  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                "set full mask for thread %d\n",
                gtid));
  KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
  return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
}
#endif
#endif // KMP_AFFINITY_SUPPORTED