Source code for gamdpy.simulation.Simulation

import numpy as np
import numba
import math
import sys
from numba import cuda

# gamdpy
import gamdpy as gp

# For type annotation
from gamdpy.integrators import Integrator
from gamdpy.interactions import Interaction
from gamdpy.runtime_actions import RuntimeAction

# TODO: to remove import above you need to add a lot of lines are the following
#from ..simulation.get_default_compute_plan import get_default_compute_plan
#from ..configuration.Configuration import Configuration

# IO
import h5py



[docs]
class Simulation():
    """ Class for running a simulation.

    Parameters
    ----------

    configuration : ~gamdpy.Configuration
        The configuration to simulate.

    interactions : one or a list of interactions
        One or a list of interactions such as pair potentials, bonds, external fields, etc. See the :ref:`interactions` section for more.

    integrator : an integrator
        The integrator to use for the simulation. See the :ref:`integrators` section for more.

    runtime_actions : list of runtime actions
        List of runtime actions.
        See the :ref:`runtime_actions` section for more.

    num_timeblocks : int
        Number of timeblocks of run the simulation.

    steps_per_timeblock : int
        Number of steps in each timeblock.

    compute_plan : dict
        A dictionary with the compute plan for the simulation. If None, a default compute plan is used.

    storage : str
        Storage for the simulation output. Can be 'memory' or a filename with extension '.h5'.

    timing : bool
        If True, timing information is saved.


    See also
    --------

    :func:`gamdpy.get_default_sim`

    """

    def __init__(self, configuration: gp.Configuration, 
                 interactions: Interaction|list[Interaction], 
                 integrator: Integrator,
                 runtime_actions: list[RuntimeAction],
                 num_timeblocks, steps_per_timeblock,
                 storage: str,
                 compute_plan=None,
                 timing=True,
                 steps_in_kernel_test=1):

        self.configuration = configuration
        if compute_plan == None:
            self.compute_plan = gp.get_default_compute_plan(self.configuration)
        else:
            self.compute_plan = compute_plan

        # Integrator
        if type(interactions) == list:
            self.interactions = interactions
        else:
            self.interactions = [interactions, ]
        self.integrator = integrator
        self.dt = self.integrator.dt

        self.num_blocks = num_timeblocks
        self.current_block = -1

        self.steps_per_block = steps_per_timeblock
        self.storage = storage
        self.timing = timing
        self.steps_in_kernel_test = steps_in_kernel_test

        # Close output object if there
        # Check https://stackoverflow.com/questions/610883/how-to-check-if-an-object-has-an-attribute
        # Create output objects
        if self.storage == 'memory':
            # Creates a memory h5 file with named id(self).h5; id(self) is ensured to be unique
            self.memory = h5py.File(f"{id(self)}.h5", "w", driver='core', backing_store=False)
        elif isinstance(self.storage, str) and self.storage[-3:] == '.h5':
            # The append is important for repeated istances of sim with same self.storage
            self.memory = h5py.File(self.storage, "w")
        else:
            raise ValueError(f"storage needs to be either 'memory' or an hdf5 filename ending in '.h5' (got: {storage})")
 
        # Save setup info
        self.memory.attrs['dt'] = self.dt
        if 'script_name' not in self.memory.keys():
            script_name = sys.argv[0]
            self.memory.attrs['script_name'] = script_name
            if isinstance(script_name,str) and script_name != '':
                with open(script_name, 'r') as file:
                    script_content = file.read()
                self.memory.attrs['script_content'] = script_content

        # Saving initial configuration
        self.configuration.save(output=self.memory, group_name="initial_configuration", mode="w",
                update_ptype=True, update_topology=True)
        
        self.runtime_actions = runtime_actions

        compute_flags = None
        for runtime_action in self.runtime_actions:
            if runtime_action.get_compute_flags() is not None:
                if compute_flags is not None:
                    raise ValueError('Can not handle more than one compute_flags in runtime_actions')
                else:
                    compute_flags = runtime_action.get_compute_flags()

        self.compute_flags = gp.get_default_compute_flags() # configuration.compute_flags
        if compute_flags is not None:
            # only keys present in the default are processed
            for k in compute_flags:
                if k in self.compute_flags:
                    self.compute_flags[k] = compute_flags[k]
                else:
                    raise ValueError('Unknown key in compute_flags:%s' %k)
        for k in self.compute_flags:
            if self.compute_flags[k] and not configuration.compute_flags[k]:
                raise ValueError('compute_flags["%s]" set for Simulation but not in Configuration' % k)

        for runtime_action in self.runtime_actions:
            runtime_action.setup(configuration=self.configuration, num_timeblocks=num_timeblocks,
                                steps_per_timeblock=steps_per_timeblock, output=self.memory )

        self.vectors_list = []
        self.scalars_list = []
        self.simbox_data_list = []

        self.JIT_and_test_kernel()
        for interaction in self.interactions: # Attempt to catch (eg.) nblist errors before actually doing any simulation
            interaction.check_datastructure_validity()


        if self.storage and self.storage[-3:] == '.h5':
            self.memory.close()

    # __del__ is supposed to work also if __init__ fails. This means you can't use attributed defined in __init__
    # https://www.algorithm.co.il/programming/python-gotchas-1-__del__-is-not-the-opposite-of-__init__/

    def get_output(self, mode="r"):
        if self.storage and self.storage[-3:] == '.h5':
            output = h5py.File(self.storage, mode)
        elif self.storage == 'memory':
            output = self.memory
        else:
            #print("Warning: self.output can't recognize self.storage option, returning None")
            output = None
        return output

    # might be worth looking into cache_property https://www.reddit.com/r/learnpython/comments/184kqzp/in_class_properties_defined_in_functions/
    @property
    def output(self):
        return self.get_output()

    def JIT_and_test_kernel(self, adjust_compute_plan=True):
        while True:
            try:
                self.get_kernels_and_params()
                self.integrate = self.make_integrator(self.configuration, self.integrator_kernel, self.interactions_kernel,
                                                #self.output_calculator_kernel,
                                                self.runtime_actions_prestep_kernel,
                                                self.runtime_actions_poststep_kernel,
                                                self.compute_plan, True)
                self.configuration.copy_to_device() # By _not_ copying back to host later we dont change configuration
                self.integrate_self(0.0, self.steps_in_kernel_test)
                break
            except numba.cuda.cudadrv.driver.CudaAPIError as e:
                if not adjust_compute_plan:
                    self.compute_plan['tp'] = 0 # Signal failure to autotuner
                    break
                #print('Failed compute_plan : ', self.compute_plan)
                if self.compute_plan['tp'] > 1:             # Most common problem tp is too big
                    self.compute_plan['tp'] -= 1            # ... so we reduce it and try again
                elif self.compute_plan['gridsync'] == True: # Last resort: turn off gridsync
                    self.compute_plan['gridsync'] = False
                else:
                    print(f'FAILURE. Can not handle cuda error {e}')
                    exit()
                print('Trying adjusted compute_plan :', self.compute_plan)

    def get_kernels_and_params(self, verbose=False):
        # Interactions
        self.interactions_kernel, self.interactions_params = gp.add_interactions_list(self.configuration,
                                                                                      self.interactions,
                                                                                      compute_plan=self.compute_plan,
                                                                                      compute_flags=self.compute_flags)

        # Runtime actions
        if self.runtime_actions:
            self.runtime_actions_prestep_kernel, self.runtime_actions_poststep_kernel, self.runtime_actions_params = gp.add_runtime_actions_list(self.configuration,
                                                                                                    self.runtime_actions,
                                                                                                    compute_plan=self.compute_plan)
        else:
            self.runtime_actions_prestep_kernel = None
            self.runtime_actions_poststep_kernel = None
            self.runtime_actions_params = (0,)

        # Integrator
        self.integrator_params = self.integrator.get_params(self.configuration, self.interactions_params)
        self.integrator_kernel = self.integrator.get_kernel(self.configuration, self.compute_plan, self.compute_flags, self.interactions_kernel)

        return

    def update_params(self, verbose=False):
        # Interactions
        _, self.interactions_params = gp.add_interactions_list(self.configuration,
                                                                self.interactions,
                                                                compute_plan=self.compute_plan,
                                                                compute_flags=self.compute_flags)

        # Runtime actions
        if self.runtime_actions:
            _, _, self.runtime_actions_params = gp.add_runtime_actions_list(self.configuration,
                                                                                                    self.runtime_actions,
                                                                                                    compute_plan=self.compute_plan)
        else:
            #self.runtime_actions_kernel = None
            self.runtime_actions_params = (0,)

        # Integrator
        self.integrator_params = self.integrator.get_params(self.configuration, self.interactions_params, verbose)
        #self.integrator_kernel = self.integrator.get_kernel(self.configuration, self.compute_plan, verbose)

        return

    def integrate_self(self, time_zero, steps):
        self.integrate(self.configuration.d_vectors,
                           self.configuration.d_scalars,
                           self.configuration.d_ptype,
                           self.configuration.d_r_im,
                           self.configuration.simbox.d_data,
                           self.interactions_params,
                           self.integrator_params,
                           self.runtime_actions_params,
                           np.float32(time_zero),
                           steps)
        return


    def make_integrator(self, configuration, integration_step, compute_interactions,# output_calculator_kernel,
                        runtime_actions_prestep_kernel, runtime_actions_poststep_kernel, compute_plan, verbose=True):
        
        # Unpack parameters from configuration and compute_plan
        D, num_part = configuration.D, configuration.N
        pb, tp, gridsync = [compute_plan[key] for key in ['pb', 'tp', 'gridsync']]
        num_blocks = (num_part - 1) // pb + 1

        get_integrator_setup = getattr(self.integrator, "get_setup_kernel", None)
        if get_integrator_setup is not None:
            integrator_setup = get_integrator_setup(self.configuration, self.compute_plan, self.interactions_kernel)
            self.configuration.copy_to_device()
            integrator_setup(
                self.configuration.d_vectors,
                self.configuration.d_scalars,
                self.configuration.d_r_im,
                self.configuration.simbox.d_data,
                self.integrator_params,
                self.configuration.d_ptype,
            )

        if gridsync:
            # Return a kernel that does 'steps' timesteps, using grid.sync to syncronize   
            @cuda.jit
            def integrator(vectors, scalars, ptype, r_im, sim_box, interaction_params, integrator_params,
                            runtime_actions_params, time_zero, steps):
                grid = cuda.cg.this_grid()
                for step in range(steps + 1): # make extra step without integration, so that interactions and run_time actions called for final configuration
                    time = time_zero + step * integrator_params[0]
                    compute_interactions(grid, vectors, scalars, ptype, sim_box, interaction_params)
                    grid.sync()
                    if runtime_actions_prestep_kernel != None:
                        runtime_actions_prestep_kernel(grid, vectors, scalars, r_im, sim_box, step, runtime_actions_params)
                        grid.sync()
                    if step<steps:
                        integration_step(grid, vectors, scalars, r_im, sim_box, integrator_params, time, ptype)
                        grid.sync()
                        if runtime_actions_poststep_kernel != None:
                            runtime_actions_poststep_kernel(grid, vectors, scalars, r_im, sim_box, step, runtime_actions_params)
                            grid.sync()

                return

            return integrator[num_blocks, (pb, tp)]

        else:

            # Return a Python function that does 'steps' timesteps, using kernel calls to syncronize  
            def integrator(vectors, scalars, ptype, r_im, sim_box, interaction_params, integrator_params,
                           runtime_actions_params, time_zero, steps):
                for step in range(steps + 1): # make extra step without integration, so that interactions and run_time actions called for final configuration
                    time = time_zero + step * integrator_params[0]
                    compute_interactions(0, vectors, scalars, ptype, sim_box, interaction_params)
                    if runtime_actions_prestep_kernel != None:
                        runtime_actions_prestep_kernel(0, vectors, scalars, r_im, sim_box, step, runtime_actions_params)
                    if step<steps:
                        integration_step(0, vectors, scalars, r_im, sim_box, integrator_params, time, ptype)
                        if runtime_actions_poststep_kernel != None:
                            runtime_actions_poststep_kernel(0, vectors, scalars, r_im, sim_box, step, runtime_actions_params)
                return

            return integrator
        return

        # simple run function


[docs]
    def run(self, verbose=True) -> None:
        """ Run all blocks of the simulation.
        See :func:`gamdpy.Simulation.run_timeblocks` for an open loop version.

        """
        for _ in self.run_timeblocks():
            if verbose:
                print(self.status(per_particle=True))
        if verbose:
            print(self.summary())


    # generator for running simulation one block at a time

[docs]
    def run_timeblocks(self, num_timeblocks=-1):
        """ Generator for running the simulation one block at a time.
        The state of the simulation object is updated between block,
        and data is copied to the host for (optional) data analysis.

        Parameters
        ----------

        num_timeblocks : int
            Number of blocks to run. All blocks are run if -1 (recommended).

        Yields
        ------
        int
            Index of the current block.

        Examples
        --------

        >>> import gamdpy as gp
        >>> sim = gp.get_default_sim()  # Replace this with your own simulation object
        >>> for block in sim.run_timeblocks():
        ...     print(f'{block=}')  # Replace this with code to analyze the current configuration
        block=0
        block=1
        block=2
        block=3
        block=4
        block=5
        block=6
        block=7


        See also
        --------

        :func:`gamdpy.Simulation.run`

        """
        if num_timeblocks == -1:
            num_timeblocks = self.num_blocks
        self.last_num_blocks = num_timeblocks
        assert (num_timeblocks <= self.num_blocks)  # Could be made OK with more blocks

        self.configuration.copy_to_device()
        self.vectors_list = []
        self.scalars_list = []
        self.simbox_data_list = []
        self.scalars_t = []

        if self.timing:
            start = cuda.event()
            end = cuda.event()
            start_block = cuda.event()
            end_block = cuda.event()
            block_times = []
            start.record()

        zero = np.float32(0.0)

        for block in range(num_timeblocks):

            self.current_block = block
            for runtime_action in self.runtime_actions:
                runtime_action.initialize_before_timeblock(block, self.get_output(mode="a"))
            
            if self.timing: 
                start_block.record()

            self.integrate_self(np.float32(block * self.steps_per_block * self.dt),self.steps_per_block)

            if self.timing:
                end_block.record()
                end_block.synchronize()
                block_times.append(cuda.event_elapsed_time(start_block, end_block))

            self.configuration.copy_to_host()

            for interaction in self.interactions:
                interaction.check_datastructure_validity()

            for runtime_action in self.runtime_actions:
                runtime_action.update_at_end_of_timeblock(block, self.get_output(mode="a"))

            #if self.storage:
            #    self.configuration.save(output=self.get_output(mode="a"), group_name=f"/restarts/restart{block:04d}", mode="w", include_topology=True)

            if self.storage and self.storage[-3:] == '.h5':
                self.output.close()

            yield block

        # Finalizing run
        if self.timing:
            end.record()
            end.synchronize()

            self.timing_numba = cuda.event_elapsed_time(start, end)
            self.timing_numba_blocks = np.array(block_times)
        #self.nbflag = self.interactions[0].nblist.d_nbflag.copy_to_host()
        self.scalars_list = np.array(self.scalars_list)



[docs]
    def status(self, per_particle=False) -> str:
        """ String with the current status
        Should be executed during the simulation run, see :func:`gamdpy.Simulation.timeblocks`

        Parameters
        ----------

        per_particle : bool
            If True, the values are divided by the number of particles in the configuration.

        Returns
        -------

        str
            A string with the current status of the simulation.

        """
        time = (self.current_block+1) * self.steps_per_block * self.dt
        st = f'timeblock= {self.current_block :<6}'
        st += f'{time= :<12.3f}'
        for name in self.configuration.sid:
            if name in self.configuration.compute_flags and self.configuration.compute_flags[name]:
                data = np.sum(self.configuration[name])
                if per_particle:
                    data /= self.configuration.N
                st += f'{name}= {data:<10.3f}'
        return st



[docs]
    def summary(self) -> str:
        """ Returns a summary string of the simulation run.
         Should be called after the simulation has been run,
         see :func:`~gamdpy.Simulation.run_timeblocks`.
         """
        if self.timing:
            time_total = self.timing_numba / 1000
            tps_total = self.last_num_blocks * self.steps_per_block / time_total
            time_sim = np.sum(self.timing_numba_blocks) / 1000
            tps_sim = self.last_num_blocks * self.steps_per_block / time_sim

        st = f'Particles : {self.configuration.N} \n'
        st += f'Steps : {self.last_num_blocks} * {self.steps_per_block} = '
        st += f'{self.last_num_blocks * self.steps_per_block:_} \n'
        if self.timing:
            st += f'Total run time  (incl. time spent between timeblocks): {time_total:.2f} s '
            st += f'( TPS: {tps_total:.2e} )\n'
            st += f'Simulation time (excl. time spent between timeblocks): {time_sim:.2f} s '
            st += f'( TPS: {tps_sim:.2e} )\n'
        return st


    def autotune_bruteforce(self, pbs='auto', skins='auto', tps='auto', timesteps=0, repeats=1, verbose=False):
        if verbose:
            print('compute_plan :', self.compute_plan)
        if timesteps==0: 
            timesteps = self.steps_per_block
        assert timesteps<=self.steps_per_block
        
        pb = self.compute_plan['pb']
        if pbs=='auto':
            pbs = [pb//2, pb, pb*2]
        if pbs=='default':
            pbs = [pb,]
        if verbose:
            print('pbs :', pbs)

        tp = self.compute_plan['tp']
        if tps=='auto': 
            tps = [tp - 3, tp - 2, tp - 1, tp, tp + 1, tp + 2, tp + 3,]
        if tps=='default':
            tps = [tp,]
        if verbose:
            print('tps :', tps)

        skin = self.compute_plan['skin']
        if skins=='auto':
            skins = [skin - 0.6, skin - 0.4, skin - 0.2, skin, skin + 0.2, skin + 0.4, skin + 0.6, skin + 0.8,  skin + 1.0] 
        elif skins=='default':
            skins = [skin, ]
        if verbose:
            print('skins :', skins)

        nblists = []
        if self.configuration.N < 32000:
            nblists.append('N squared')
        if self.configuration.N > 2000:
            nblists.append('linked lists')
        if verbose:
            print('nblists:', nblists)

        gridsyncs = []
        if self.configuration.N < 200000:
            gridsyncs.append(True)
        if self.configuration.N > 10000:
            gridsyncs.append(False)
        if verbose:
            print('gridsyncs :', gridsyncs)
            
        UtilizeNIIIs = [False, True]
        if verbose:
            print('UtilizeNIIIs :', UtilizeNIIIs)
            
        flag = cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS
        cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = False
        
        skin_times = []
        total_min_time = 1e9
        optimal_compute_plan = gp.get_default_compute_plan(self.configuration) # Overwritten below

        for nblist in nblists:
            self.compute_plan['nblist'] = nblist
            for gridsync in gridsyncs:
                self.compute_plan['gridsync'] = gridsync
                for UtilizeNIII in UtilizeNIIIs:
                    self.compute_plan['UtilizeNIII'] = UtilizeNIII
                    print(f'\n {nblist}, {gridsync=}, {UtilizeNIII=}: ', end='')
                    local_min_time = 1e9
                    for pb in pbs:
                        if pb <= 512:
                            self.compute_plan['pb'] = pb
                            for tp in tps:
                                if tp>0:
                                    self.compute_plan['tp'] = tp
                                    gridsync = self.compute_plan['gridsync']
                                    self.JIT_and_test_kernel(adjust_compute_plan=False)
                                    # does kernel run without adjustment?
                                    if self.compute_plan['tp'] != tp or self.compute_plan['gridsync'] != gridsync: 
                                        break
                                    #print('Seems to work, so looping over skins...')
                                    total_min_time, local_min_time = self.autotune_scan_skin(self.compute_plan, skins, timesteps, repeats, total_min_time, local_min_time, optimal_compute_plan, verbose)

        self.compute_plan = optimal_compute_plan
        if verbose:
            print('\nFinal compute_plan :', self.compute_plan)
        else:
            print('')
            #print('\nFinal compute_plan :', self.compute_plan)
        self.JIT_and_test_kernel()
        
        cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = flag


    def autotune_scan_skin(self, compute_plan, skins, timesteps, repeats, total_min_time, local_min_time, optimal_compute_plan, verbose=False):
        min_time = 1e9
        skin_times = []
        pb = compute_plan['pb']
        tp = compute_plan['tp']
        for skin in skins:
            if 0 < skin < 1.2:
                self.compute_plan['skin'] = skin
                self.update_params()
                self.configuration.copy_to_device() # By _not_ copying back to host later we dont change configuration
                start = cuda.event()
                end = cuda.event()
                start.record()
                for i in range(repeats):
                    self.integrate_self(0.0, timesteps)
                end.record()
                end.synchronize()
                time_elapsed = cuda.event_elapsed_time(start, end)
                if time_elapsed < min_time:
                    min_time = time_elapsed
                    min_skin = skin
                skin_times.append(time_elapsed)
                if verbose:
                    print(f"({skin}, {skin_times[-1]:.3})", end=' ', flush=True)
        max_TPS = repeats * timesteps / min_time * 1000
        if verbose:
            print('\n', pb, tp, min_skin, min_time, max_TPS)
        else:
            print('.', end='', flush=True)
        if min_time < local_min_time:
            local_min_time = min_time
            print(f' ({pb}x{tp},{min_skin:.2f}):{max_TPS:.2f}', end=' ', flush=True)
        if min_time < total_min_time:
            total_min_time = min_time
            optimal_compute_plan['UtilizeNIII'] = self.compute_plan['UtilizeNIII']
            optimal_compute_plan['nblist'] = self.compute_plan['nblist']
            optimal_compute_plan['gridsync'] = self.compute_plan['gridsync']
            optimal_compute_plan['skin'] = min_skin
            optimal_compute_plan['pb'] = pb
            optimal_compute_plan['tp'] = tp
        return total_min_time, local_min_time



[docs]
    def autotune(self, include_linked_lists=True):
        """ Autotune the simulation parameters for most efficient calculations on the current machine """
        flag = cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS
        cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = False

        initial_compute_plan = self.compute_plan.copy()
        timesteps = self.steps_per_block
        repeats = 1

        # Binary choises: Take self.compute_plan as starting point, and add alternatives if appropiate 
        gridsyncs = [initial_compute_plan['gridsync'], ]
        if gridsyncs[0] != True:
            gridsyncs.append(True)
        if gridsyncs[0] != False and self.configuration.N > 10000:
            gridsyncs.append(False)

        nblists = [initial_compute_plan['nblist'], ]
        if nblists[0] != 'N squared' and self.configuration.N < 32000:
            nblists.append('N squared')
        if nblists[0] != 'linked lists' and self.configuration.N > 2000 and include_linked_lists:
            nblists.append('linked lists')

        UtilizeNIIIs = [initial_compute_plan['UtilizeNIII'], ]
        if UtilizeNIIIs[0] != False:
            UtilizeNIIIs.append(False)
        if UtilizeNIIIs[0] != True:
            UtilizeNIIIs.append(True)


        pb = self.compute_plan['pb']
        pbs = [pb//2, pb, pb*2]

        optimal_compute_plan = initial_compute_plan.copy()
        results = []
        # Loop over binary parameters
        total_min_time = 1e9
        binary_min_time = 1e9
        for nblist in nblists:
            self.compute_plan['nblist'] = nblist
            for gridsync in gridsyncs:
                self.compute_plan['gridsync'] = gridsync
                for UtilizeNIII in UtilizeNIIIs:
                    self.compute_plan['UtilizeNIII'] = UtilizeNIII
                    
                    # Get time for default 'pb', 'tp' to check if its worth to proceede with these choises
                    self.compute_plan['pb'] = initial_compute_plan['pb']
                    self.compute_plan['tp'] = initial_compute_plan['tp']
                    print(f'\n {nblist+",":12}\t{gridsync=}\t{UtilizeNIII=}:\t', end='')
                    self.JIT_and_test_kernel(adjust_compute_plan=False)
                    if self.compute_plan['tp'] == 0 or self.compute_plan['gridsync'] != gridsync: 
                        continue
        
                    local_min_time = 1e9
                    total_min_time, local_min_time, min_time = self.autotune_skin(initial_compute_plan['skin'], 0.2, 
                                                                        timesteps, repeats, total_min_time, local_min_time, 
                                                                        optimal_compute_plan, verbose=False)
                    self.compute_plan['min_time'] = min_time
                    if min_time < binary_min_time:
                        binary_min_time = min_time
                    results.append(self.compute_plan.copy())
        
        for compute_plan in results:
            if compute_plan['min_time'] < 1.15 * binary_min_time:
                        
                local_min_time = 1e9
                self.compute_plan = compute_plan.copy()
                nblist = compute_plan['nblist']
                gridsync = compute_plan['gridsync']
                UtilizeNIII = compute_plan['UtilizeNIII']
                print(f'\n {nblist+",":12}\t{gridsync=}\t{UtilizeNIII=}:\t', end='')

                pb = compute_plan['pb']
                pb_min_time = 1e9
                initial_min_time = -1e9
                while 4 <= pb <= 1024:
                    self.compute_plan['pb'] = pb
                    print(f' {pb=} ', end='')
                    total_min_time, local_min_time, min_time = self.autotune_tp(compute_plan['tp'], 1, compute_plan, timesteps, repeats, optimal_compute_plan, total_min_time, local_min_time)
                    if initial_min_time<0:
                        initial_min_time = min_time
                    #print(f'[{min_time:.3}, {pb_min_time:.3}]')
                    if min_time > 1.01 * pb_min_time:
                        break
                    if min_time < pb_min_time:
                        pb_min_time = min_time
                    pb *= 2
                pb = compute_plan['pb'] // 2
                if initial_min_time < 1.01*pb_min_time: # Is it worth trying smaller pb?
                    while 4 <= pb <= 1024:
                        self.compute_plan['pb'] = pb
                        print(f' {pb=} ', end='')
                        total_min_time, local_min_time, min_time = self.autotune_tp(compute_plan['tp'], 1, compute_plan, timesteps, repeats, optimal_compute_plan, total_min_time, local_min_time)
                        #print(f'[{min_time:.3}, {pb_min_time:.3}]')
                        if min_time > 1.01 * pb_min_time:
                            break
                        if min_time < pb_min_time:
                            pb_min_time = min_time
                        pb = pb // 2

        self.compute_plan = optimal_compute_plan
        self.JIT_and_test_kernel()
        print()
        cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = flag



    def autotune_tp(self, initial_tp, delta_tp, initial_compute_plan, timesteps, repeats, optimal_compute_plan, total_min_time, local_min_time):
        tp = initial_tp
        tp_min_time = 1e9
        while 0 < tp <= 64:
            #print(self.compute_plan['pb'], tp, (self.compute_plan['pb']*tp) % 32)
            if (self.compute_plan['pb']*tp) % 32 == 0:
                self.compute_plan['tp'] = tp
                self.JIT_and_test_kernel(adjust_compute_plan=False)
                if self.compute_plan['tp'] != tp: #or self.compute_plan['gridsync'] != gridsync: 
                    break
                total_min_time, local_min_time, min_time = self.autotune_skin(initial_compute_plan['skin'], 0.2, 
                                                                    timesteps, repeats, total_min_time, local_min_time, 
                                                                    optimal_compute_plan, verbose=False)
                if min_time > 1.05 * tp_min_time:
                    break
                if min_time < tp_min_time:
                    tp_min_time = min_time
            tp += delta_tp
        tp = initial_tp - delta_tp
        while 0 < tp <= 64:
            if (self.compute_plan['pb']*tp) % 32 == 0:
                self.compute_plan['tp'] = tp
                self.JIT_and_test_kernel(adjust_compute_plan=False)
                if self.compute_plan['tp'] != tp: #or self.compute_plan['gridsync'] != gridsync: 
                    break
                total_min_time, local_min_time, min_time = self.autotune_skin(initial_compute_plan['skin'], 0.2, 
                                                                    timesteps, repeats, total_min_time, local_min_time, 
                                                                    optimal_compute_plan, verbose=False)
                if min_time > 1.05 * tp_min_time:
                    break
                if min_time < tp_min_time:
                    tp_min_time = min_time
            tp -= delta_tp
        return total_min_time, local_min_time, tp_min_time

 
    def autotune_skin(self, initial_skin, delta_skin, timesteps, repeats, total_min_time, local_min_time, optimal_compute_plan, verbose=False):
        min_time = 1e9
        min_skin = -0.1
        
        # Upscan
        min_time, min_skin = self.scan_skin(timesteps, repeats, initial_skin, delta_skin, min_time, min_skin)
        
        # Downscan
        min_time, min_skin = self.scan_skin(timesteps, repeats, initial_skin-delta_skin, -delta_skin, min_time, min_skin)
        
        max_TPS = repeats * timesteps / min_time * 1000
        if min_time < local_min_time:
            local_min_time = min_time
            print(f" ({self.compute_plan['pb']}x{self.compute_plan['tp']},{min_skin:.2f}):{max_TPS:.2f}", end=' ', flush=True)
        else:
            print('.', end='', flush=True)
        if min_time < total_min_time:
            total_min_time = min_time
            optimal_compute_plan['UtilizeNIII'] = self.compute_plan['UtilizeNIII']
            optimal_compute_plan['nblist'] = self.compute_plan['nblist']
            optimal_compute_plan['gridsync'] = self.compute_plan['gridsync']
            optimal_compute_plan['skin'] = min_skin
            optimal_compute_plan['pb'] = self.compute_plan['pb']
            optimal_compute_plan['tp'] = self.compute_plan['tp']
        return total_min_time, local_min_time, min_time

    def scan_skin(self, timesteps, repeats, skin, delta_skin, min_time, min_skin):
        while abs(delta_skin)/2 < skin < 1.45: # Should keep on going until eg. linked lists throw an error
            self.compute_plan['skin'] = skin
            self.update_params()
            self.configuration.copy_to_device() # By _not_ copying back to host later we dont change configuration
            start = cuda.event()
            end = cuda.event()
            start.record()
            for i in range(repeats):
                self.integrate_self(0.0, timesteps)
            end.record()
            end.synchronize()
            time_elapsed = cuda.event_elapsed_time(start, end)
            #print(f'{skin=}, {time_elapsed=}')
            if time_elapsed < min_time:
                min_time = time_elapsed
                min_skin = skin
            elif time_elapsed > 1.1*min_time:
                break
            skin += delta_skin
        return min_time, min_skin