# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Helper functions for running models in a distributed setting."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf


def get_distribution_strategy(num_gpus, all_reduce_alg=None):
  """Return a DistributionStrategy for running the model.

  Args:
    num_gpus: Number of GPUs to run this model.
    all_reduce_alg: Specify which algorithm to use when performing all-reduce.
      See tf.contrib.distribute.AllReduceCrossTowerOps for available
      algorithms. If None, DistributionStrategy will choose based on device
      topology.

  Returns:
    tf.contrib.distribute.DistributionStrategy object.
  """
  if num_gpus == 0:
    return tf.contrib.distribute.OneDeviceStrategy("device:CPU:0")
  elif num_gpus == 1:
    return tf.contrib.distribute.OneDeviceStrategy("device:GPU:0")
  else:
    if all_reduce_alg:
      return tf.contrib.distribute.MirroredStrategy(
          num_gpus=num_gpus,
          cross_tower_ops=tf.contrib.distribute.AllReduceCrossTowerOps(
              all_reduce_alg, num_packs=num_gpus))
    else:
      # Use collective all-reduce across the local GPUs.
      # Alternative: tf.contrib.distribute.MirroredStrategy(num_gpus=num_gpus)
      return tf.contrib.distribute.CollectiveAllReduceStrategy(
          num_gpus_per_worker=num_gpus)


def per_device_batch_size(batch_size, num_gpus):
  """For multi-GPU training, the batch size must be a multiple of num_gpus.

  Note that this should eventually be handled by DistributionStrategies
  directly. Multi-GPU support is currently experimental, however, so we do
  the work here until that feature is in place.

  Args:
    batch_size: Global batch size to be divided among devices. This should be
      equal to num_gpus times the single-GPU batch_size for multi-GPU training.
    num_gpus: How many GPUs are used with DistributionStrategies.

  Returns:
    Batch size per device.

  Raises:
    ValueError: if batch_size is not divisible by the number of devices.
  """
  if num_gpus <= 1:
    return batch_size

  remainder = batch_size % num_gpus
  if remainder:
    err = ("When running with multiple GPUs, batch size "
           "must be a multiple of the number of available GPUs. Found {} "
           "GPUs with a batch size of {}; try --batch_size={} instead."
          ).format(num_gpus, batch_size, batch_size - remainder)
    raise ValueError(err)
  return int(batch_size / num_gpus)
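

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): a minimal example of how these helpers
# might be wired into an Estimator-based training loop. `model_fn` and the
# literal flag values below are hypothetical placeholders, not part of this
# module or its callers.
# ---------------------------------------------------------------------------
#
#   num_gpus = 2
#   global_batch_size = 256
#
#   # Split the global batch across devices before building the input
#   # pipeline, so each GPU sees global_batch_size / num_gpus examples.
#   batch_size = per_device_batch_size(global_batch_size, num_gpus)
#
#   # Hand the strategy to the Estimator via RunConfig.train_distribute.
#   distribution = get_distribution_strategy(num_gpus, all_reduce_alg="nccl")
#   run_config = tf.estimator.RunConfig(train_distribute=distribution)
#   estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)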