# Source code for tensorflowonspark.gpu_info

# Copyright 2017 Yahoo Inc.
# Licensed under the terms of the Apache 2.0 license.
# Please see LICENSE file in the project root for terms.

from __future__ import absolute_import
from __future__ import division
from __future__ import nested_scopes
from __future__ import print_function

import logging
import random
import subprocess
import time

logger = logging.getLogger(__name__)

MAX_RETRIES = 3           #: Maximum retries to allocate GPUs
AS_STRING = 'string'      #: ``format`` selector for ``get_gpus``: return a comma-delimited string of GPU ids
AS_LIST = 'list'          #: ``format`` selector for ``get_gpus``: return a list of GPU ids


def is_gpu_available():
    """Report whether ``nvidia-smi`` can list GPUs on this host.

    Runs ``nvidia-smi --list-gpus`` and discards its output.

    Returns:
      True if the command ran and exited successfully, False if running it
      raised any exception (e.g. the binary is missing or exits non-zero).
    """
    probe_cmd = ["nvidia-smi", "--list-gpus"]
    try:
        subprocess.check_output(probe_cmd)
    except Exception:
        # Best-effort probe: any failure simply means "no usable GPU".
        return False
    return True
def _parse_gpu_line(gpu_str):
    """Extract ``(uuid, index)`` from one line of ``nvidia-smi --list-gpus``.

    A line looks like ``GPU 0: Tesla K80 (UUID: GPU-1234abcd-...)``. The fields
    are located by the ``:`` and ``UUID:`` markers rather than by their
    whitespace-separated position, because the product name in the middle can
    contain any number of words.
    """
    index = gpu_str.partition(':')[0].split()[-1]
    uuid = gpu_str.rpartition('UUID:')[2].strip().rstrip(')')
    return uuid, index


def _select_gpus(free_gpus, num_gpu, worker_index):
    """Choose ``num_gpu`` entries from ``free_gpus`` for the given worker hint."""
    if worker_index == -1:
        # use original random placement
        random.shuffle(free_gpus)
        return free_gpus[:num_gpu]

    # ordered by worker index
    start = worker_index * num_gpu
    if start + num_gpu > len(free_gpus):
        # More workers than GPU groups: wrap around over the complete groups of
        # ``num_gpu`` devices so the slice always stays inside the list.
        num_slots = len(free_gpus) // num_gpu
        start = (worker_index % num_slots) * num_gpu
    return free_gpus[start:start + num_gpu]


def get_gpus(num_gpu=1, worker_index=-1, format=AS_STRING):
    """Get list of free GPUs according to nvidia-smi.

    This will retry for ``MAX_RETRIES`` times until the requested number of GPUs are available.
    A GPU is considered free when its UUID is not reported by
    ``nvidia-smi --query-compute-apps``.

    Args:
      :num_gpu: number of GPUs desired.
      :worker_index: index "hint" for allocation of available GPUs. ``-1`` picks randomly among the free GPUs;
        otherwise the worker gets the ``worker_index``-th group of ``num_gpu`` free GPUs, wrapping around
        when there are more workers than groups.
      :format: ``AS_STRING`` for a comma-delimited string, or ``AS_LIST`` for a list of GPU ids.

    Returns:
      Comma-delimited string of GPU ids (``AS_STRING``) or a list of GPU ids (``AS_LIST``).

    Raises:
      Exception: if the requested number of GPUs could not be found, or if ``format`` is unknown.
    """
    # get list of gpus (index, uuid)
    list_gpus = subprocess.check_output(["nvidia-smi", "--list-gpus"]).decode()
    logger.debug("all GPUs:\n{0}".format(list_gpus))

    # parse index and guid
    gpu_list = [_parse_gpu_line(line) for line in list_gpus.split('\n') if line.strip()]

    free_gpus = []
    retries = 0
    while len(free_gpus) < num_gpu and retries < MAX_RETRIES:
        smi_output = subprocess.check_output(
            ["nvidia-smi", "--format=csv,noheader,nounits", "--query-compute-apps=gpu_uuid"]).decode()
        logger.debug("busy GPUs:\n{0}".format(smi_output))
        busy_uuids = {line.strip() for line in smi_output.split('\n') if line.strip()}
        free_gpus = [index for uuid, index in gpu_list if uuid not in busy_uuids]

        if len(free_gpus) < num_gpu:
            logger.warning("Unable to find available GPUs: requested={0}, available={1}".format(num_gpu, len(free_gpus)))
            retries += 1
            free_gpus = []
            # Back off only when another attempt follows; sleeping after the
            # final attempt would just delay the exception below.
            if retries < MAX_RETRIES:
                time.sleep(30 * retries)

    logger.info("Available GPUs: {}".format(free_gpus))

    # if still can't find available GPUs, raise exception
    if len(free_gpus) < num_gpu:
        smi_output = subprocess.check_output(
            ["nvidia-smi", "--format=csv", "--query-compute-apps=gpu_uuid,pid,process_name,used_gpu_memory"]).decode()
        logger.info(": {0}".format(smi_output))
        raise Exception("Unable to find {} free GPU(s)\n{}".format(num_gpu, smi_output))

    # Get logical placement
    proposed_gpus = _select_gpus(free_gpus, num_gpu, worker_index)
    logger.info("Proposed GPUs: {}".format(proposed_gpus))

    if format == AS_STRING:
        return ','.join(str(x) for x in proposed_gpus)
    elif format == AS_LIST:
        return proposed_gpus
    else:
        raise Exception("Unknown GPU format")