<aside> 💡 이 문서는 Elastic Horovod의 커뮤니케이션 풀 형성 과정을 소개합니다.

</aside>

1. Rendezvous

// Excerpt of the background thread loop. The portion shown initializes the
// global Gloo context and, when the control operation is GLOO, the process set
// table. Code before this point is elided ("...") and the function body
// continues past the end of this excerpt.
void BackgroundThreadLoop(HorovodGlobalState& state) {
...
#if HAVE_GLOO
#if HAVE_MPI
  if (global_mpi_context.IsEnabled()) {
    // Initialize gloo context if mpi context is available
    // (uses the mpi_context of process set 0 and the parsed Gloo interface).
    global_gloo_context.InitializeFromMPI(
        state.process_set_table.Get(0).mpi_context, ParseGlooIface());
  } else
#endif // HAVE_MPI
  {
    // Reached when the MPI context is not enabled. When HAVE_MPI is 0 the
    // if/else above is compiled out, so this block runs unconditionally.
    global_gloo_context.Initialize(ParseGlooIface());
  }
  if (state.control_operation == LibType::GLOO) {
    // Initializes global controller
    state.process_set_table.Initialize(global_gloo_context);
  }
#endif // HAVE_GLOO

ParseGlooIface()는 HOROVOD_GLOO_IFACE, 즉 --network-interface 값을 반환한다.

// Excerpt: performs three Rendezvous() calls and keeps the resulting contexts
// in global_ctx, local_ctx and cross_ctx.
//
// gloo_iface is not referenced in the lines shown; it is presumably consumed in
// the elided section ("...") that also sets up rank/size, dev and the
// rendezvous address/port -- confirm against the full source.
void GlooContext::Initialize(const std::string& gloo_iface) {
  ...
  // Global rendezvous: global prefix, with rank/size as the coordinates.
	ctx = Rendezvous(HOROVOD_GLOO_GLOBAL_PREFIX,
                   rendezvous_addr_env, rendezvous_port,
                   rank, size, dev, timeout_);
  LOG(DEBUG) << "Global Gloo context initialized.";

  global_ctx = ctx;

  // Local rendezvous: the prefix is suffixed with hostname_ and the
  // coordinates are local_rank/local_size.
  local_ctx = Rendezvous(HOROVOD_GLOO_LOCAL_PREFIX + hostname_,
                         rendezvous_addr_env, rendezvous_port,
                         local_rank, local_size, dev, timeout_);
  LOG(DEBUG) << "Local Gloo context initialized.";

  // Cross rendezvous: the prefix is suffixed with local_rank and the
  // coordinates are cross_rank/cross_size. Presumably this groups processes
  // that share a local_rank across hosts -- confirm against the full source.
  cross_ctx = Rendezvous(HOROVOD_GLOO_CROSS_PREFIX + std::to_string(local_rank),
                         rendezvous_addr_env, rendezvous_port,
                         cross_rank, cross_size, dev, timeout_);
  LOG(DEBUG) << "Cross-node Gloo context initialized.";
}

위와 같이 ctx, local_ctx, cross_ctx에 대해 Rendezvous를 생성한다.

// Creates a gloo::rendezvous::Context for `rank` of `size`, connects it with
// connectFullMesh() through a key/value store, runs a barrier, finalizes the
// store and returns the context.
//
// prefix          - passed to HTTPStore and printed in the debug log line.
// server_addr_env - rendezvous server address; when null, a MemoryStore is
//                   used instead of an HTTPStore.
// server_port     - port passed to HTTPStore.
// rank, size      - coordinates passed to the rendezvous context.
// dev             - transport device passed to connectFullMesh().
// timeout         - applied to both the context and the barrier.
std::shared_ptr<gloo::Context> Rendezvous(const std::string& prefix,
                                          const char* server_addr_env, int server_port,
                                          int rank, int size,
                                          std::shared_ptr<gloo::transport::Device>& dev,
                                          std::chrono::milliseconds timeout) {
  // Choose the store backing the rendezvous.
  std::unique_ptr<GlooStore> kv_store;
  if (server_addr_env == nullptr) {
    kv_store.reset(new MemoryStore());
  } else {
    std::string rendezvous_host = server_addr_env;
    kv_store.reset(new HTTPStore(rendezvous_host, server_port, prefix, rank));
  }

  const auto timeout_secs =
      std::chrono::duration_cast<std::chrono::seconds>(timeout).count();
  LOG(DEBUG) << prefix << " rendezvous started for rank=" << rank
             << ", size=" << size << ", dev={" << dev->str()
             << "}, timeout=" << std::to_string(timeout_secs);

  std::shared_ptr<gloo::rendezvous::Context> mesh =
      std::make_shared<gloo::rendezvous::Context>(rank, size);
  mesh->setTimeout(timeout);
  mesh->connectFullMesh(*kv_store, dev);

  {
    // Don't finalize the store until all clients have had a chance to connect.
    // The options object is scoped so it is destroyed before Finalize().
    gloo::BarrierOptions barrier_opts(mesh);
    barrier_opts.setTimeout(timeout);
    gloo::barrier(barrier_opts);
  }

  kv_store->Finalize();
  return mesh;
}

Rendezvous는 아래와 같은 과정으로 진행된다.

Rendezvous를 통해 모든 프로세스 간 커뮤니케이션 풀이 형성된다.

// Excerpt of the handler that, judging by its name, runs while the pair's
// socket is still connecting. Only the socket-error query and the hand-off to
// handleConnected() are shown; the surrounding logic is elided ("...").
void Pair::handleConnecting() {
  ...
  // Read the socket's pending error (SO_ERROR) for fd_ into optval. How rv and
  // optval are checked is in the elided code.
  rv = getsockopt(fd_, SOL_SOCKET, SO_ERROR, &optval, &optlen);
  ...
  // Continue with the connected-state handling.
  handleConnected();
}

2. Algorithms