<aside> 💡 이 문서는 Elastic Horovod의 커뮤니케이션 풀 형성 과정을 소개합니다.
</aside>
// Excerpt of BackgroundThreadLoop showing only the Gloo-related setup; the
// surrounding code (and the end of the function) is elided in this document.
void BackgroundThreadLoop(HorovodGlobalState& state) {
...
#if HAVE_GLOO
#if HAVE_MPI
if (global_mpi_context.IsEnabled()) {
// MPI is enabled: initialize the Gloo context from the MPI context of
// process set 0, passing the interface returned by ParseGlooIface().
global_gloo_context.InitializeFromMPI(
state.process_set_table.Get(0).mpi_context, ParseGlooIface());
} else
// NOTE: the `else` above pairs with the braced block after the #endif. When
// HAVE_MPI is not defined, that block is compiled as an unconditional scope.
#endif // HAVE_MPI
{
// Initialize the Gloo context without MPI (see GlooContext::Initialize).
global_gloo_context.Initialize(ParseGlooIface());
}
if (state.control_operation == LibType::GLOO) {
// Gloo is the control-plane library: initialize the process set table
// with the Gloo context.
state.process_set_table.Initialize(global_gloo_context);
}
#endif // HAVE_GLOO
ParseGlooIface()는 HOROVOD_GLOO_IFACE, 즉 --network-interface 값이다.
// Excerpt of GlooContext::Initialize: performs three rendezvous rounds and
// stores the resulting contexts. The code that derives rank/size values, the
// rendezvous address/port and `dev` is elided (`...`).
void GlooContext::Initialize(const std::string& gloo_iface) {
...
// Global context: scoped by HOROVOD_GLOO_GLOBAL_PREFIX, using (rank, size).
ctx = Rendezvous(HOROVOD_GLOO_GLOBAL_PREFIX,
rendezvous_addr_env, rendezvous_port,
rank, size, dev, timeout_);
LOG(DEBUG) << "Global Gloo context initialized.";
global_ctx = ctx;
// Local context: scope is the local prefix plus hostname_, using
// (local_rank, local_size) -- presumably the processes on this host.
local_ctx = Rendezvous(HOROVOD_GLOO_LOCAL_PREFIX + hostname_,
rendezvous_addr_env, rendezvous_port,
local_rank, local_size, dev, timeout_);
LOG(DEBUG) << "Local Gloo context initialized.";
// Cross context: scope is the cross prefix plus this process's local_rank,
// using (cross_rank, cross_size) -- presumably one peer per host that shares
// the same local_rank.
cross_ctx = Rendezvous(HOROVOD_GLOO_CROSS_PREFIX + std::to_string(local_rank),
rendezvous_addr_env, rendezvous_port,
cross_rank, cross_size, dev, timeout_);
LOG(DEBUG) << "Cross-node Gloo context initialized.";
}
위와 같이 ctx(= global_ctx), local_ctx, cross_ctx 각각에 대해 Rendezvous()를 호출하여 컨텍스트를 생성한다.
// Creates a Gloo context for the rendezvous scope `prefix` and connects it
// to every other participant.
//
// prefix          - scope string handed to the HTTP store and used in the log.
// server_addr_env - rendezvous server address; when null an in-process
//                   MemoryStore is used instead of an HTTPStore.
// server_port     - rendezvous server port (used only with the HTTP store).
// rank, size      - this participant's rank and the number of participants.
// dev             - Gloo transport device used to build the connections.
// timeout         - applied to the context and to the closing barrier.
//
// Returns the connected context.
std::shared_ptr<gloo::Context> Rendezvous(const std::string& prefix,
                                          const char* server_addr_env, int server_port,
                                          int rank, int size,
                                          std::shared_ptr<gloo::transport::Device>& dev,
                                          std::chrono::milliseconds timeout) {
  // Choose the key-value store through which peers are exchanged.
  std::unique_ptr<GlooStore> kv_store;
  if (server_addr_env == nullptr) {
    kv_store.reset(new MemoryStore());
  } else {
    std::string rendezvous_host = server_addr_env;
    kv_store.reset(new HTTPStore(rendezvous_host, server_port, prefix, rank));
  }

  const auto timeout_secs =
      std::chrono::duration_cast<std::chrono::seconds>(timeout).count();
  LOG(DEBUG) << prefix << " rendezvous started for rank=" << rank
             << ", size=" << size << ", dev={" << dev->str()
             << "}, timeout=" << std::to_string(timeout_secs);

  // Build the context and connect it to all peers via the store.
  auto mesh = std::make_shared<gloo::rendezvous::Context>(rank, size);
  mesh->setTimeout(timeout);
  mesh->connectFullMesh(*kv_store, dev);

  {
    // Wait on a barrier so the store is not finalized before every client
    // has had a chance to connect.
    gloo::BarrierOptions barrier_opts(mesh);
    barrier_opts.setTimeout(timeout);
    gloo::barrier(barrier_opts);
  }

  kv_store->Finalize();
  return mesh;
}
Rendezvous는 아래와 같은 과정으로 진행된다.
- Device
- Store
- gloo::rendezvous::Context (== gloo::Context)
- connectFullMesh()

Rendezvous를 통해 모든 프로세스 간 커뮤니케이션 풀이 형성된다.
// Excerpt of Pair::handleConnecting: only the two visible steps are shown;
// the declarations and the checks between them are elided (`...`).
void Pair::handleConnecting() {
...
// Query SO_ERROR on fd_ at the SOL_SOCKET level; the result is written to
// optval and the call's return code to rv. Presumably this checks whether
// the pending connect succeeded -- the check itself is elided.
rv = getsockopt(fd_, SOL_SOCKET, SO_ERROR, &optval, &optlen);
...
// Continue with the connected-state handling.
handleConnected();
}