Source code for gamdpy.simulation.get_default_compute_plan
import numpy as np
import numba
from numba import cuda
import math
import matplotlib.pyplot as plt
import os
[docs]
def get_default_compute_plan(configuration):
    """Return a default compute_plan for the given configuration.

    The default compute_plan is a dictionary with a set of parameters
    specifying how computations are done on the GPU. The returned plan depends
    on the number of particles and on properties of the GPU. When the numba
    CUDA simulator is active (environment variable NUMBA_ENABLE_CUDASIM set to
    "1"), no device is queried; the number of numba threads is used in place
    of the number of GPU cores.

    Parameters
    ----------
    configuration
        Object exposing the number of particles as the attribute ``N``.

    Returns
    -------
    dict
        Dictionary with the keys:

        - 'pb': particles per thread block (a power of two in [8, 256])
        - 'tp': threads per particle (an integer in [1, 16])
        - 'gridsync': Boolean indicating if synchronization should be done
          by grid.sync() calls
        - 'skin': used when updating nblist (0.5, or np.float32(1.0) for
          6 * 1024 < N <= 8000)
        - 'UtilizeNIII': Boolean indicating if Newton's third law (NIII)
          should be utilized (see pairpotential_calculator)
        - 'nblist': 'N squared' (default) or 'linked lists'. Determines the
          algorithm updating nblist
    """
    N = configuration.N

    # Get relevant info about the device.
    if os.getenv("NUMBA_ENABLE_CUDASIM") != "1":
        # NUMBA_ENABLE_CUDASIM is set to "1" if the cuda simulator is used, so
        # in this branch a real device is expected to be available.
        # See: https://numba.pydata.org/numba-doc/dev/cuda/simulator.html
        device = cuda.get_current_device()

        # Apparently we can't ask the device how many cores it has, neither in
        # total nor per SM (Streaming Multiprocessor), so we read the latter
        # from a stored dictionary keyed on the compute capability.
        from gamdpy.cc_cores_per_SM_dict import cc_cores_per_SM_dict
        if device.compute_capability in cc_cores_per_SM_dict:
            cc_cores_per_SM = cc_cores_per_SM_dict[device.compute_capability]
        else:
            print('gamdpy WARNING: Could not find cc_cores_per_SM for this compute_capability. Guessing: 128')
            cc_cores_per_SM = 128

        num_SM = device.MULTIPROCESSOR_COUNT
        num_cc_cores = cc_cores_per_SM * num_SM
        warpsize = device.WARP_SIZE
    else:
        # The GPU simulator is active: treat the machine as a single SM with
        # as many "cores" as numba threads, and no warp-size constraint.
        num_SM = 1
        num_cc_cores = numba.get_num_threads()
        warpsize = 1

    # pb: particles per (thread) block.
    # Halve the block size until there are at least two blocks per SM
    # (performance heuristic), then clamp the result to [8, 256].
    pb = 512
    while N // pb < 2 * num_SM and pb >= 8:
        pb = pb // 2
    if pb < 8:
        pb = 8
    if pb > 256:
        pb = 256

    # tp: threads per particle.
    # Increase tp until there are at least two threads per core (conservative
    # performance heuristic). The search is bounded by max_tp, the same cap
    # that is applied below: without the bound the loop never terminates for
    # N <= 0 and iterates needlessly for small N on large devices, while the
    # final value is unchanged whenever the unbounded search terminates.
    max_tp = 16
    tp = 1
    while tp < max_tp and N * tp < 2 * num_cc_cores:
        tp += 1
    # The number of threads per thread block should be a multiple of warpsize.
    while (pb * tp) % warpsize != 0:
        tp += 1
    if tp > max_tp:
        tp = max_tp

    # skin: used when updating nblist.
    skin = 0.5
    if N > 6 * 1024:
        # Make the nblist valid for many steps for large N.
        skin = np.float32(1.0)

    # UtilizeNIII: Boolean flag indicating if Newton's third law (NIII) should
    # be utilized (see pairpotential_calculator). Utilization of NIII is
    # implemented by using atomic adds to the force array, so it is
    # inefficient at small system sizes where a lot of conflicts occur.
    UtilizeNIII = True
    if N < 16 * 1024:
        UtilizeNIII = False

    # gridsync: Boolean flag indicating whether synchronization should be done
    # via grid.sync().
    gridsync = True
    if N * tp > 4 * num_cc_cores:  # Heuristic
        gridsync = False

    # nblist: algorithm used for updating the neighbor list. Switching to
    # 'linked lists' also resets skin to 0.5, overriding the larger skin
    # chosen above.
    nblist = 'N squared'
    if N > 8_000:  # Heuristic
        nblist = 'linked lists'
        skin = 0.5

    return {'pb': pb, 'tp': tp, 'skin': skin,
            'UtilizeNIII': UtilizeNIII, 'gridsync': gridsync, 'nblist': nblist}