Source code for gamdpy.simulation.get_default_compute_plan

import numpy as np
import numba
from numba import cuda
import math
import matplotlib.pyplot as plt
import os

def _query_device_resources():
    """Return ``(num_SM, num_cc_cores, warpsize)`` for the active compute device.

    When the environment variable ``NUMBA_ENABLE_CUDASIM`` equals ``"1"`` the
    CUDA simulator is in use and no device is queried; otherwise the values
    are read from the current CUDA device.
    """
    if os.getenv("NUMBA_ENABLE_CUDASIM") == "1":
        # Simulator active: treat the host as a single SM with one core per
        # numba thread and a warp size of 1.
        # See: https://numba.pydata.org/numba-doc/dev/cuda/simulator.html
        return 1, numba.get_num_threads(), 1

    device = cuda.get_current_device()

    # Apparently the device cannot be asked how many cores it has, neither in
    # total nor per SM, so the per-SM count is read from a stored dictionary
    # keyed by compute capability.
    from gamdpy.cc_cores_per_SM_dict import cc_cores_per_SM_dict

    compute_capability = device.compute_capability
    if compute_capability in cc_cores_per_SM_dict:
        cc_cores_per_SM = cc_cores_per_SM_dict[compute_capability]
    else:
        print('gamdpy WARNING: Could not find cc_cores_per_SM for this compute_capability. \nGuessing: 128')
        cc_cores_per_SM = 128

    num_SM = device.MULTIPROCESSOR_COUNT
    return num_SM, cc_cores_per_SM * num_SM, device.WARP_SIZE


def get_default_compute_plan(configuration):
    """Return a default compute_plan.

    The default compute_plan is a dictionary with a set of parameters
    specifying how computations are done on the GPU. The returned plan depends
    on the number of particles (``configuration.N``) and on properties of the
    GPU (or of the CUDA simulator, when that is active).

    Parameters
    ----------
    configuration
        Object exposing the number of particles as attribute ``N``.

    Returns
    -------
    dict
        Dictionary with the keys:

        - 'pb': particles per thread block (between 8 and 256)
        - 'tp': threads per particle (between 1 and 16)
        - 'skin': used when updating nblist
        - 'UtilizeNIII': Boolean indicating if Newton's third law (NIII)
          should be utilized (see pairpotential_calculator)
        - 'gridsync': Boolean indicating if synchronization should be done
          by grid.sync() calls
        - 'nblist': 'N squared' (default) or 'linked lists'. Determines the
          algorithm updating nblist
    """
    N = configuration.N
    num_SM, num_cc_cores, warpsize = _query_device_resources()

    # pb: particles per (thread) block. Halve from 512 until there are at
    # least two blocks per SM (performance heuristic), then clamp to [8, 256].
    pb = 512
    while N // pb < 2 * num_SM and pb >= 8:
        pb //= 2
    pb = min(max(pb, 8), 256)

    # tp: threads per particle. Increase until the total thread count reaches
    # twice the number of cores (conservative performance heuristic).
    # The search is bounded by the cap applied below: this yields the same
    # result for every N > 0, but guarantees termination when N == 0 and
    # avoids a very long loop for tiny systems on large devices.
    max_tp = 16
    tp = 1
    while tp < max_tp and N * tp < 2 * num_cc_cores:
        tp += 1
    # Number of threads per thread block should be a multiple of warpsize.
    while (pb * tp) % warpsize != 0:
        tp += 1
    tp = min(tp, max_tp)

    # skin: used when updating nblist. A larger skin makes the nblist valid
    # for many steps for large N (overridden below for 'linked lists').
    skin = 0.5
    if N > 6 * 1024:
        skin = np.float32(1.0)

    # UtilizeNIII: utilization of Newton's third law is implemented by using
    # atomic adds to the force array, so it is inefficient at small system
    # sizes where a lot of conflicts occur.
    UtilizeNIII = N >= 16 * 1024

    # gridsync: whether synchronization should be done via grid.sync().
    # Heuristic: switched off once the thread count exceeds 4x the core count.
    gridsync = N * tp <= 4 * num_cc_cores

    # nblist: algorithm used for updating the neighbor list (heuristic).
    nblist = 'N squared'
    if N > 8_000:
        nblist = 'linked lists'
        skin = 0.5  # linked lists always use the small skin

    return {'pb': pb, 'tp': tp, 'skin': skin, 'UtilizeNIII': UtilizeNIII,
            'gridsync': gridsync, 'nblist': nblist}