_static/html/gocean__opencl__trans_8py_source.html

 # -----------------------------------------------------------------------------

 # BSD 3-Clause License

 #

 # Copyright (c) 2021-2024, Science and Technology Facilities Council.

 # All rights reserved.

 #

 # Redistribution and use in source and binary forms, with or without

 # modification, are permitted provided that the following conditions are met:

 #

 # * Redistributions of source code must retain the above copyright notice, this

 #   list of conditions and the following disclaimer.

 #

 # * Redistributions in binary form must reproduce the above copyright notice,

 #   this list of conditions and the following disclaimer in the documentation

 #   and/or other materials provided with the distribution.

 #

 # * Neither the name of the copyright holder nor the names of its

 #   contributors may be used to endorse or promote products derived from

 #   this software without specific prior written permission.

 #

 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS

 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

 # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,

 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER

 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT

 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN

 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

 # POSSIBILITY OF SUCH DAMAGE.

 # -----------------------------------------------------------------------------

 # Authors R. W. Ford, A. R. Porter and S. Siso, STFC Daresbury Lab


 '''This module contains the GOcean-specific OpenCL transformation.

 '''


 import os


 from fparser.two import Fortran2003

 from psyclone.configuration import Config

 from psyclone.domain.common.transformations import KernelModuleInlineTrans

 from psyclone.errors import GenerationError

 from psyclone.gocean1p0 import GOInvokeSchedule, GOLoop

 from psyclone.psyGen import Transformation, args_filter, InvokeSchedule, \

     HaloExchange

 from psyclone.psyir.backend.opencl import OpenCLWriter

 from psyclone.psyir.frontend.fortran import FortranReader

 from psyclone.psyir.nodes import Routine, Call, Reference, Literal, \

     Assignment, IfBlock, ArrayReference, Schedule, BinaryOperation, \

     StructureReference, FileContainer, CodeBlock, IntrinsicCall

 from psyclone.psyir.symbols import (

     ArrayType, DataSymbol, RoutineSymbol, ContainerSymbol,

     UnsupportedFortranType, ArgumentInterface, ImportInterface,

     INTEGER_TYPE, CHARACTER_TYPE, BOOLEAN_TYPE, ScalarType)

 from psyclone.transformations import TransformationError


 class GOOpenCLTrans(Transformation):

     '''

     Switches on/off the generation of an OpenCL PSy layer for a given

     InvokeSchedule. Additionally, it will generate OpenCL kernels for

     each of the kernels referenced by the Invoke. For example:


     >>> from psyclone.parse.algorithm import parse

     >>> from psyclone.psyGen import PSyFactory

     >>> API = "gocean1.0"

     >>> FILENAME = "shallow_alg.f90" # examples/gocean/eg1

     >>> ast, invoke_info = parse(FILENAME, api=API)

     >>> psy = PSyFactory(API, distributed_memory=False).create(invoke_info)

     >>> schedule = psy.invokes.get('invoke_0').schedule

     >>> ocl_trans = GOOpenCLTrans()

     >>> ocl_trans.apply(schedule)

     >>> print(schedule.view())


     '''

     # Specify which OpenCL command queue to use for management operations like

     # data transfers when generating an OpenCL PSy-layer

     _OCL_MANAGEMENT_QUEUE = 1


     # TODO #1572: These are class attributes because multiple invokes may need

     # to generate a single OpenCL environment (e.g. to share the device data

     # pointers) and therefore guarantee the same properties, but this hasn't

     # been tested. PSycloneBench ShallowWater could be an example of this.


     # Biggest queue number that any kernel is allocated to. The OpenCL

     # environment should be set up with at least this number of queues.

     _max_queue_number = 1

     # Whether to enable the profiling option in the OpenCL environment

     _enable_profiling = False

     # Whether to enable the out_of_order option in the OpenCL environment

     _out_of_order = False

     # Total number of invokes that have been transformed to OpenCL

     _transformed_invokes = 0

     # Reference to the OpenCL kernels file

     _kernels_file = None


     @property
     def name(self):
         '''
         Provide the identifier under which this transformation is known.

         :returns: the name of this transformation.
         :rtype: str

         '''
         transformation_name = "GOOpenCLTrans"
         return transformation_name


     def validate(self, node, options=None):
         '''
         Checks that the supplied InvokeSchedule is valid and that an OpenCL
         version of it can be generated.

         :param node: the Schedule to check.
         :type node: :py:class:`psyclone.psyGen.InvokeSchedule`
         :param options: a dictionary with options for transformations. \
                 A value of None is treated as an empty dictionary.
         :type options: dict of str:value or None
         :param bool options["enable_profiling"]: whether or not to set up the \
                 OpenCL environment with the profiling option enabled.
         :param bool options["out_of_order"]: whether or not to set up the \
                 OpenCL environment with the out_of_order option enabled.
         :param bool options["end_barrier"]: whether or not to add an OpenCL \
                 barrier at the end of the transformed invoke.

         :raises TransformationError: if the supplied node is not an \
                                      InvokeSchedule.
         :raises TransformationError: if the InvokeSchedule is not for the \
                                      GOcean1.0 API.
         :raises TransformationError: if any of the kernels have arguments \
                                      which are passed as a literal.
         :raises TransformationError: if any of the provided options is invalid.
         :raises TransformationError: if any of the provided options is not \
                                      compatible with a previous OpenCL
                                      environment.
         :raises TransformationError: if any kernel in this invoke has a \
                                      global variable used by an import.
         :raises TransformationError: if any kernel does not iterate over \
                                      the whole grid.
         '''
         # validate() may be called directly (without going through apply()),
         # in which case 'options' can still be None.
         if not options:
             options = {}

         if not isinstance(node, InvokeSchedule):
             raise TransformationError(
                 f"Error in GOOpenCLTrans: the supplied node must be a (sub-"
                 f"class of) InvokeSchedule but got {type(node)}")
         if not isinstance(node, GOInvokeSchedule):
             raise TransformationError(
                 f"OpenCL generation is currently only supported for the "
                 f"GOcean API but got an InvokeSchedule of type: "
                 f"'{type(node).__name__}'")

         # Validate options map
         valid_options = ['end_barrier', 'enable_profiling', 'out_of_order']
         for key, value in options.items():
             if key not in valid_options:
                 raise TransformationError(
                     f"InvokeSchedule does not support the OpenCL option "
                     f"'{key}'. The supported options are: {valid_options}.")
             # All current options should contain boolean values
             if not isinstance(value, bool):
                 raise TransformationError(
                     f"InvokeSchedule OpenCL option '{key}' should be a "
                     f"boolean.")

         # Validate that the options are valid with previously generated OpenCL
         if self._transformed_invokes > 0:
             if ('enable_profiling' in options and
                     self._enable_profiling != options['enable_profiling']):
                 raise TransformationError(
                     f"Can't generate an OpenCL Invoke with enable_profiling='"
                     f"{options['enable_profiling']}' since a previous "
                     f"transformation used a different value, and their OpenCL"
                     f" environments must match.")

             if ('out_of_order' in options and
                     self._out_of_order != options['out_of_order']):
                 raise TransformationError(
                     f"Can't generate an OpenCL Invoke with out_of_order='"
                     f"{options['out_of_order']}' since a previous "
                     f"transformation used a different value, and their OpenCL "
                     f"environments must match.")

         # Now we need to check that none of the invoke arguments is a literal
         args = args_filter(node.args, arg_types=["scalar"])
         for arg in args:
             if arg.is_literal:
                 raise TransformationError(
                     f"Cannot generate OpenCL for Invokes that contain kernel "
                     f"arguments which are a literal, but found the literal "
                     f"'{arg.name}' used as an argument in invoke "
                     f"'{node.name}'.")

         # Check that we can construct the PSyIR and SymbolTable of each of
         # the kernels in this Schedule. Also check that none of them access
         # any form of global data (that is not a routine argument).
         for kern in node.kernels():
             KernelModuleInlineTrans().validate(kern)
             ksched = kern.get_kernel_schedule()
             global_variables = ksched.symbol_table.imported_symbols
             if global_variables:
                 raise TransformationError(
                     f"The Symbol Table for kernel '{kern.name}' contains the "
                     f"following symbols with 'global' scope: "
                     f"{[sym.name for sym in global_variables]}. An OpenCL "
                     f"kernel cannot call other kernels and all of the data it "
                     f"accesses must be passed by argument. Use the "
                     f"KernelImportsToArguments transformation to convert such "
                     f"symbols to kernel arguments first.")

         # In OpenCL all kernel loops should iterate the whole grid
         for kernel in node.kernels():
             inner_loop = kernel.ancestor(GOLoop)
             outer_loop = inner_loop.ancestor(GOLoop)
             if not (inner_loop.field_space == "go_every" and
                     outer_loop.field_space == "go_every" and
                     inner_loop.iteration_space == "go_all_pts" and
                     outer_loop.iteration_space == "go_all_pts"):
                 raise TransformationError(
                     f"The kernel '{kernel.name}' does not iterate over all "
                     f"grid points. This is a necessary requirement for "
                     f"generating the OpenCL code and can be done by applying "
                     f"the GOMoveIterationBoundariesInsideKernelTrans to each "
                     f"kernel before the GOOpenCLTrans.")


     def apply(self, node, options=None):
         '''
         Apply the OpenCL transformation to the supplied GOInvokeSchedule. This
         causes PSyclone to generate an OpenCL version of the corresponding
         PSy-layer routine. The generated code makes use of the FortCL
         library (https://github.com/stfc/FortCL) in order to manage the
         OpenCL device directly from Fortran.

         :param node: the InvokeSchedule to transform.
         :type node: :py:class:`psyclone.gocean1p0.GOInvokeSchedule`
         :param options: set of option to tune the OpenCL generation.
         :type options: dict of str:value or None
         :param bool options["enable_profiling"]: whether or not to set up the \
                 OpenCL environment with the profiling option enabled.
         :param bool options["out_of_order"]: whether or not to set up the \
                 OpenCL environment with the out_of_order option enabled.
         :param bool options["end_barrier"]: whether or not to add an OpenCL \
                 barrier at the end of the transformed invoke (defaults to \
                 True).

         :raises TransformationError: if the checks performed by validate() \
                 fail.

         '''
         if not options:
             options = {}

         self.validate(node, options)
         api_config = Config.get().api_conf("gocean1.0")

         # Update the OpenCL environment properties.
         # NOTE(review): these are declared as class attributes but are
         # assigned through 'self' here, which binds them on this instance;
         # confirm this is the intended sharing behaviour (see TODO #1572).
         if 'enable_profiling' in options:
             self._enable_profiling = options['enable_profiling']

         if 'out_of_order' in options:
             self._out_of_order = options['out_of_order']

         self._transformed_invokes += 1

         # Get end_barrier option
         end_barrier = options.get('end_barrier', True)

         # Update the maximum value that any queue_number has.
         for kernel in node.coded_kernels():
             self._max_queue_number = max(
                 self._max_queue_number,
                 kernel.opencl_options["queue_number"])

         # Insert, if they don't already exist, the necessary OpenCL helper
         # subroutines in the root Container.
         psy_init = self._insert_opencl_init_routine(node.root)
         init_grid = self._insert_initialise_grid_buffers(node.root)
         write_grid_buf = self._insert_write_grid_buffers(node.root)
         self._insert_ocl_read_from_device_function(node.root)
         self._insert_ocl_write_to_device_function(node.root)
         init_buf = self._insert_ocl_initialise_buffer(node.root)

         for kern in node.coded_kernels():
             self._insert_ocl_arg_setter_routine(node.root, kern)

         # Insert fortcl, clfortran and c_iso_binding import statement
         fortcl = ContainerSymbol("fortcl")
         node.symbol_table.add(fortcl)
         get_num_cmd_queues = RoutineSymbol(
                 "get_num_cmd_queues", interface=ImportInterface(fortcl))
         get_cmd_queues = RoutineSymbol(
                 "get_cmd_queues", interface=ImportInterface(fortcl))
         get_kernel_by_name = RoutineSymbol(
                 "get_kernel_by_name", interface=ImportInterface(fortcl))
         node.symbol_table.add(get_num_cmd_queues)
         node.symbol_table.add(get_cmd_queues)
         node.symbol_table.add(get_kernel_by_name)
         clfortran = ContainerSymbol("clfortran")
         node.symbol_table.add(clfortran)
         cl_finish = RoutineSymbol(
                 "clFinish", interface=ImportInterface(clfortran))
         cl_launch = RoutineSymbol(
                 "clEnqueueNDRangeKernel",
                 interface=ImportInterface(clfortran))
         node.symbol_table.add(cl_finish)
         node.symbol_table.add(cl_launch)
         iso_c_binding = ContainerSymbol("iso_c_binding")
         node.symbol_table.add(iso_c_binding)
         c_loc = RoutineSymbol(
                 "C_LOC", interface=ImportInterface(iso_c_binding))
         c_null = DataSymbol(
                 "C_NULL_PTR", datatype=INTEGER_TYPE,
                 interface=ImportInterface(iso_c_binding))
         node.symbol_table.add(c_loc)
         node.symbol_table.add(c_null)

         # Include the check_status subroutine if we are in debug_mode.
         # 'check_status' is only bound (and only used below) in debug mode.
         if api_config.debug_mode:
             ocl_utils = ContainerSymbol("ocl_utils_mod")
             check_status = RoutineSymbol(
                 "check_status", interface=ImportInterface(ocl_utils))
             node.symbol_table.add(ocl_utils)
             node.symbol_table.add(check_status)

         # Declare local variables needed by an OpenCL PSy-layer invoke
         qlist = node.symbol_table.new_symbol(
             "cmd_queues", symbol_type=DataSymbol,
             datatype=UnsupportedFortranType(
                 "integer(kind=c_intptr_t), pointer, save :: cmd_queues(:)"),
             tag="opencl_cmd_queues")
         # 'first_time' needs to be an UnsupportedFortranType because it has
         # SAVE and initial value
         first = DataSymbol("first_time",
                            datatype=UnsupportedFortranType(
                                "logical, save :: first_time = .true."))
         node.symbol_table.add(first, tag="first_time")
         flag = node.symbol_table.new_symbol(
             "ierr", symbol_type=DataSymbol, datatype=INTEGER_TYPE,
             tag="opencl_error")
         global_size = node.symbol_table.new_symbol(
             "globalsize", symbol_type=DataSymbol,
             datatype=UnsupportedFortranType(
                 "integer(kind=c_size_t), target :: globalsize(2)"))
         local_size = node.symbol_table.new_symbol(
             "localsize", symbol_type=DataSymbol,
             datatype=UnsupportedFortranType(
                 "integer(kind=c_size_t), target :: localsize(2)"))

         # Bring all the boundaries at the beginning (since we are going to
         # use them during the setup block - and they don't change)
         boundary_vars = []
         for tag, symbol in node.symbol_table.tags_dict.items():
             if tag.startswith(("xstart_", "xstop_", "ystart_", "ystop_")):
                 boundary_vars.append(symbol)
         # 'cursor' is the index in node.children just after the last
         # boundary assignment that has been moved to the top.
         cursor = 0
         for assignment in node.walk(Assignment):
             if assignment.lhs.symbol in boundary_vars:
                 node.children.insert(cursor, assignment.detach())
                 cursor += 1

         # Create block of code to execute only the first time:
         setup_block = IfBlock.create(Reference(first), [])
         setup_block.preceding_comment = \
             "Initialise OpenCL runtime, kernels and buffers"
         node.children.insert(cursor, setup_block)
         setup_block.if_body.addchild(Call.create(psy_init, []))

         # Set up cmd_queues pointer
         ptree = Fortran2003.Pointer_Assignment_Stmt(
             f"{qlist.name} => {get_cmd_queues.name}()")
         cblock = CodeBlock([ptree], CodeBlock.Structure.STATEMENT)
         setup_block.if_body.addchild(cblock)

         # Declare and assign kernel pointers
         for kern in node.coded_kernels():
             name = "kernel_" + kern.name
             try:
                 kpointer = node.symbol_table.lookup_with_tag(name)
             except KeyError:
                 pointer_type = UnsupportedFortranType(
                     "INTEGER(KIND=c_intptr_t), TARGET, SAVE :: " + name)
                 kpointer = DataSymbol(name, datatype=pointer_type)
                 node.symbol_table.add(kpointer, tag=name)
             setup_block.if_body.addchild(
                 Assignment.create(
                     Reference(kpointer),
                     Call.create(get_kernel_by_name,
                                 [Literal(kern.name, CHARACTER_TYPE)])))

         # Traverse all arguments and make sure all the buffers are initialised
         initialised_fields = set()
         there_is_a_grid_buffer = False
         for kern in node.coded_kernels():
             for arg in kern.arguments.args:
                 if arg.argument_type == "field":
                     field = node.symbol_table.lookup(arg.name)
                     if field not in initialised_fields:
                         # Call the init_buffer routine with this field
                         call = Call.create(init_buf, [Reference(field)])
                         setup_block.if_body.addchild(call)
                         initialised_fields.add(field)
                 elif (arg.argument_type == "grid_property" and
                       not arg.is_scalar):
                     if not there_is_a_grid_buffer:
                         # Call the grid init_buffer routine
                         field = node.symbol_table.lookup(
                                 kern.arguments.find_grid_access().name)
                         call = Call.create(init_grid, [Reference(field)])
                         setup_block.if_body.addchild(call)
                         there_is_a_grid_buffer = True
                 if not arg.is_scalar:
                     # All buffers will be assigned to a local OpenCL memory
                     # object to easily reference them, make sure this local
                     # variable is declared in the Invoke.
                     name = arg.name + "_cl_mem"
                     try:
                         node.symbol_table.lookup_with_tag(name)
                     except KeyError:
                         node.symbol_table.new_symbol(
                             name, tag=name, symbol_type=DataSymbol,
                             datatype=UnsupportedFortranType(
                                 "INTEGER(KIND=c_intptr_t) :: " + name))

         # Now call all the set_args routines because in some platforms (e.g.
         # in Xilinx FPGA) knowing which arguments each kernel is going to use
         # allows the write operation to place the data into the appropriate
         # memory bank.
         first_statement_comment = False
         kernel_names = set()
         for kern in node.coded_kernels():
             if kern.name not in kernel_names:
                 kernel_names.add(kern.name)
                 callblock = self._generate_set_args_call(kern, node.scope)
                 for child in callblock.pop_all_children():
                     setup_block.if_body.addchild(child)
                     if not first_statement_comment:
                         child.preceding_comment = (
                             "Do a set_args now so subsequent writes place the "
                             "data appropriately")
                         first_statement_comment = True

         # Now we can insert calls to write_to_device method for each buffer
         # and the grid writing call if there is one (in a new first time block)
         first_statement_comment = False
         for field in initialised_fields:
             call = Call.create(
                 RoutineSymbol(field.name+"%write_to_device"), [])
             setup_block.if_body.addchild(call)
             if not first_statement_comment:
                 call.preceding_comment = "Write data to the device"
                 first_statement_comment = True

         if there_is_a_grid_buffer:
             fieldarg = node.coded_kernels()[0].arguments.find_grid_access()
             field = node.symbol_table.lookup(fieldarg.name)
             call = Call.create(write_grid_buf, [Reference(field)])
             setup_block.if_body.addchild(call)

         # We will just mark the nodes we are replacing as deleting them inside
         # the loop would break the PSy-layer backward_dependency method in the
         # following iterations. We will detach all these nodes after the loop.
         nodes_to_detach = []

         # Transform each kernel call loop construct to its equivalent FortCL
         # statements. Every new statement is inserted at the current position
         # of the outer loop, i.e. immediately before it, so the statements
         # end up in the order in which they are created below.
         for kern in node.coded_kernels():
             outerloop = kern.ancestor(GOLoop).ancestor(GOLoop)

             # Set up globalsize and localsize arrays
             garg = node.coded_kernels()[0].arguments.find_grid_access()
             num_x = api_config.grid_properties["go_grid_nx"].fortran\
                 .format(garg.name)
             num_y = api_config.grid_properties["go_grid_ny"].fortran\
                 .format(garg.name)
             assig = Assignment.create(
                     Reference(global_size),
                     Literal(f"(/{num_x}, {num_y}/)",
                             ArrayType(INTEGER_TYPE, [2])))
             node.children.insert(outerloop.position, assig)
             local_size_value = kern.opencl_options['local_size']
             assig = Assignment.create(
                     Reference(local_size),
                     Literal(f"(/{local_size_value}, 1/)",
                             ArrayType(INTEGER_TYPE, [2])))
             node.children.insert(outerloop.position, assig)

             # Check that the global_size is multiple of the local_size
             if api_config.debug_mode:
                 fortran_reader = FortranReader()
                 global_size_expr = fortran_reader.psyir_from_expression(
                         num_x, node.symbol_table)
                 self._add_divisibility_check(node, outerloop.position,
                                              check_status, global_size_expr,
                                              local_size_value)

             # Retrieve kernel symbol
             kernelsym = node.symbol_table.lookup_with_tag(
                             "kernel_" + kern.name)

             # Choose the command queue number to which to dispatch this kernel.
             # We have to deal with possible dependencies to kernels dispatched
             # in different command queues as the order of execution is not
             # guaranteed.
             queue_number = kern.opencl_options['queue_number']
             cmd_queue = ArrayReference.create(
                     qlist, [Literal(str(queue_number), INTEGER_TYPE)])
             dependency = outerloop.backward_dependence()

             # If the dependency is a loop containing a kernel, add a barrier if
             # the previous kernels were dispatched in a different command queue
             if dependency:
                 for kernel_dep in dependency.coded_kernels():
                     previous_queue = kernel_dep.opencl_options['queue_number']
                     if previous_queue != queue_number:
                         # If the backward dependency is being executed in
                         # another queue we add a barrier to make sure the
                         # previous kernel has finished before this one starts.
                         barrier = Assignment.create(
                                     Reference(flag),
                                     Call.create(cl_finish, [
                                         ArrayReference.create(qlist, [
                                             Literal(str(previous_queue),
                                                     INTEGER_TYPE)])]))
                         node.children.insert(outerloop.position, barrier)

             # If the dependency is something other than a kernel, currently we
             # dispatch everything else to queue _OCL_MANAGEMENT_QUEUE, so add a
             # barrier if this kernel is not on queue _OCL_MANAGEMENT_QUEUE.
             if dependency and not dependency.coded_kernels() and \
                     queue_number != self._OCL_MANAGEMENT_QUEUE:
                 barrier = Assignment.create(
                             Reference(flag),
                             Call.create(cl_finish, [
                                 ArrayReference.create(qlist, [
                                     Literal(str(self._OCL_MANAGEMENT_QUEUE),
                                             INTEGER_TYPE)])]))
                 node.children.insert(outerloop.position, barrier)

             # Check that everything has succeeded before the kernel launch
             if api_config.debug_mode:
                 self._add_ready_check(node, outerloop.position, check_status,
                                       kern.name, flag, cl_finish,
                                       cmd_queue.copy())
             callblock = self._generate_set_args_call(kern, node.scope)
             for child in callblock.pop_all_children():
                 node.children.insert(outerloop.position, child)

             # Then we call the clEnqueueNDRangeKernel
             assig = Assignment.create(
                         Reference(flag),
                         Call.create(cl_launch, [
                             # OpenCL Command Queue
                             cmd_queue,
                             # OpenCL Kernel object
                             Reference(kernelsym),
                             # Number of work dimensions
                             Literal("2", INTEGER_TYPE),
                             # Global offset (if NULL the global IDs start at
                             # offset (0,0,0))
                             Reference(c_null),
                             # Global work size
                             Call.create(c_loc, [Reference(global_size)]),
                             # Local work size
                             Call.create(c_loc, [Reference(local_size)]),
                             # Number of events in wait list
                             Literal("0", INTEGER_TYPE),
                             # Event wait list that need to be completed before
                             # this kernel
                             Reference(c_null),
                             # Event that identifies this kernel completion
                             Reference(c_null)]))
             assig.preceding_comment = "Launch the kernel"
             node.children.insert(outerloop.position, assig)
             self._insert_kernel_code_in_opencl_file(kern)

             # Add additional checks if we are in debug mode
             if api_config.debug_mode:
                 self._add_kernel_check(node, outerloop.position, check_status,
                                        kern.name, flag, cl_finish,
                                        cmd_queue.copy())

             nodes_to_detach.append(outerloop)

         # If we execute the kernels asynchronously, we need to add wait
         # statements before the halo exchanges to guarantee that the data
         # has been updated
         for possible_dependent_node in node.walk(HaloExchange):
             # The backward_dependences returns the last Loop with a kernel
             # that has a dependency with this halo exchange
             dependency = possible_dependent_node.backward_dependence()
             if dependency:
                 for kernel_dep in dependency.coded_kernels():
                     previous_queue = kernel_dep.opencl_options['queue_number']
                     if previous_queue != self._OCL_MANAGEMENT_QUEUE:
                         # If the backward dependency is being executed in
                         # another queue we add a barrier to make sure the
                         # previous kernel has finished before this halo
                         # exchange starts.
                         barrier = Assignment.create(
                                     Reference(flag),
                                     Call.create(cl_finish, [
                                         ArrayReference.create(qlist, [
                                             Literal(str(previous_queue),
                                                     INTEGER_TYPE)])]))
                         pos = possible_dependent_node.position
                         node.children.insert(pos, barrier)

         # The original kernel loops are only removed now that all of the
         # backward-dependence queries have been made.
         for node_to_detach in nodes_to_detach:
             node_to_detach.detach()

         if end_barrier:
             self._add_end_barrier(node, flag, cl_finish, qlist)

         # And at the very end always makes sure that first_time value is False
         assign = Assignment.create(Reference(first),
                                    Literal("false", BOOLEAN_TYPE))
         assign.preceding_comment = "Unset the first time flag"
         node.addchild(assign)

         self._output_opencl_kernels_file()


     def _add_end_barrier(self, node, flag, cl_finish, qlist):
         ''' Append to the given node an OpenCL wait operation for each of
         the OpenCL queues in use.

         For every queue number from 1 up to and including
         ``self._max_queue_number``, an assignment that stores in ``flag``
         the result of calling ``cl_finish`` on that element of ``qlist``
         is appended to ``node``. Only the first of these assignments is
         given a preceding comment.

         :param node: PSyIR node where to append the barrier.
         :type node: :py:class:`psyclone.psyir.nodes.Schedule`
         :param flag: PSyIR symbol to use as flag.
         :type flag: :py:class:`psyclone.psyir.symbols.DataSymbol`
         :param cl_finish: PSyIR symbol of the barrier routine.
         :type cl_finish: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
         :param qlist: PSyIR symbol of the OpenCL queues array.
         :type qlist: :py:class:`psyclone.psyir.symbols.DataSymbol`

         '''
         # We need a clFinish for each of the queues in the implementation
         for num in range(1, self._max_queue_number + 1):
             queue = ArrayReference.create(
                 qlist, [Literal(str(num), INTEGER_TYPE)])
             barrier = Assignment.create(
                 Reference(flag), Call.create(cl_finish, [queue]))
             node.addchild(barrier)
             if num == 1:
                 # The explanatory comment is only attached to the first
                 # barrier of the sequence.
                 barrier.preceding_comment = \
                     "Wait until all kernels have finished"


     @staticmethod
     def _add_divisibility_check(node, position, check_status, global_size_expr,
                                 local_size):
         ''' Insert into node a run-time check that the global size is an
         exact multiple of the local size.

         The inserted statement is an IfBlock that tests whether
         ``MOD(global_size_expr, local_size)`` differs from 0 and, if so,
         calls the ``check_status`` routine with an error message and the
         value -1.

         :param node: where to insert the divisibility check.
         :type node: :py:class:`psyclone.psyir.nodes.Schedule`
         :param int position: location where to insert the divisibility check.
         :param check_status: PSyIR symbol of the check routine.
         :type check_status: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
         :param global_size_expr: PSyIR representing the global_size.
         :type global_size_expr: :py:class:`psyclone.psyir.nodes.DataNode`
         :param int local_size: size of the OpenCL local work_group.

         '''
         # Condition: MOD(global_size, local_size) /= 0
         remainder = IntrinsicCall.create(
             IntrinsicCall.Intrinsic.MOD,
             [global_size_expr, Literal(str(local_size), INTEGER_TYPE)])
         not_divisible = BinaryOperation.create(
             BinaryOperation.Operator.NE,
             remainder,
             Literal("0", INTEGER_TYPE))

         # Body: report the problem through the check_status routine
         message = ("Global size is not a multiple of local size ("
                    "mandatory in OpenCL < 2.0).")
         report_args = [Literal(message, CHARACTER_TYPE),
                        Literal("-1", INTEGER_TYPE)]
         report = Call.create(check_status, report_args)

         node.children.insert(position,
                              IfBlock.create(not_divisible, [report]))


     @staticmethod
     def _add_kernel_check(node, position, check_status, kernel_name,
                           flag, cl_finish, cmd_queue):
         ''' Insert into node the statements that verify that a kernel has
         been launched and has run to completion without errors.

         Three consecutive statements are inserted starting at ``position``:
         a ``check_status`` call on the launch return value held in
         ``flag``, a barrier that stores the result of ``cl_finish`` on
         ``cmd_queue`` into ``flag``, and a second ``check_status`` call
         on that result.

         :param node: where to insert the kernel check.
         :type node: :py:class:`psyclone.psyir.nodes.Schedule`
         :param int position: location where to insert the kernel check.
         :param check_status: PSyIR symbol of the check routine.
         :type check_status: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
         :param str kernel_name: name of the kernel being checked.
         :param flag: PSyIR symbol to use as flag.
         :type flag: :py:class:`psyclone.psyir.symbols.DataSymbol`
         :param cl_finish: PSyIR symbol of the barrier routine.
         :type cl_finish: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
         :param cmd_queue: the OpenCL command queue to wait on. It is used \
             directly as the argument of the barrier call (presumably a \
             PSyIR reference rather than a bare symbol - to be confirmed \
             against the callers).
         :type cmd_queue: :py:class:`psyclone.psyir.symbols.DataSymbol`

         '''
         # 1) Check the return value of the kernel launch
         launch_text = Literal(f"{kernel_name} clEnqueueNDRangeKernel",
                               CHARACTER_TYPE)
         node.children.insert(
             position,
             Call.create(check_status, [launch_text, Reference(flag)]))

         # 2) Wait until the queue has finished its work
         wait_call = Call.create(cl_finish, [cmd_queue])
         node.children.insert(
             position + 1,
             Assignment.create(Reference(flag), wait_call))

         # 3) Check that the kernel execution itself reported no error
         run_text = Literal(f"Errors during {kernel_name}", CHARACTER_TYPE)
         node.children.insert(
             position + 2,
             Call.create(check_status, [run_text, Reference(flag)]))


     @staticmethod
     def _add_ready_check(node, position, check_status, kernel_name,
                          flag, cl_finish, cmd_queue):
         ''' Insert into node the statements that verify that everything
         queued before a kernel launch has completed successfully.

         Two consecutive statements are inserted starting at ``position``:
         a barrier that stores the result of ``cl_finish`` on ``cmd_queue``
         into ``flag``, followed by a ``check_status`` call on ``flag``.

         :param node: where to insert the kernel check.
         :type node: :py:class:`psyclone.psyir.nodes.Schedule`
         :param int position: location where to insert the kernel check.
         :param check_status: PSyIR symbol of the check routine.
         :type check_status: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
         :param str kernel_name: name of the kernel being checked.
         :param flag: PSyIR symbol to use as flag.
         :type flag: :py:class:`psyclone.psyir.symbols.DataSymbol`
         :param cl_finish: PSyIR symbol of the barrier routine.
         :type cl_finish: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
         :param cmd_queue: the OpenCL command queue to wait on. It is used \
             directly as the argument of the barrier call (presumably a \
             PSyIR reference rather than a bare symbol - to be confirmed \
             against the callers).
         :type cmd_queue: :py:class:`psyclone.psyir.symbols.DataSymbol`

         '''
         # Wait for all the previous work in the queue ...
         wait_call = Call.create(cl_finish, [cmd_queue])
         node.children.insert(
             position,
             Assignment.create(Reference(flag), wait_call))

         # ... and then check that it finished without errors
         text = Literal(f"Errors before {kernel_name} launch",
                        CHARACTER_TYPE)
         node.children.insert(
             position + 1,
             Call.create(check_status, [text, Reference(flag)]))


     def _insert_kernel_code_in_opencl_file(self, kernel):
         ''' Insert the given kernel into the OpenCL kernels file. For this
         we need to remove the 'go_wp' precision symbol which can't be
         generated by OpenCL. We assume 'go_wp' is an OpenCL double.

         The kernels file container (``self._kernels_file``) is created the
         first time this method is called. A copy of the kernel schedule is
         only added to it if it does not already contain a routine with the
         same name as the kernel.

         :param kernel: the kernel to insert. The code relies on it \
             providing ``get_kernel_schedule()`` and ``name`` (presumably \
             a kernel call such as CodedKern - to be confirmed).
         :type kernel: :py:class:`psyclone.psyir.nodes.KernelSchedule`

         '''
         if not self._kernels_file:
             self._kernels_file = FileContainer("opencl_kernels")

         # Create a copy of the kernel and remove precision symbols since they
         # are not supported in the OpenCL backend.
         kernel_copy = kernel.get_kernel_schedule().copy()
         symtab = kernel_copy.symbol_table

         # TODO #898: Removing symbols is not properly supported by PSyIR
         # because we have to deal with all references to it. In this case we
         # implement manually a conversion of all 'go_wp' to a double precision
         # and remove the symbol because we guarantee that it just appear in the
         # declarations of other symbols (symtab.datasymbols).
         # pylint: disable=protected-access
         for sym in symtab.datasymbols:
             # Not all types have the 'precision' attribute (e.g.
             # UnresolvedType)
             if (hasattr(sym.datatype, "precision") and
                     isinstance(sym.datatype.precision, DataSymbol)):
                 sym.datatype._precision = ScalarType.Precision.DOUBLE

         if 'go_wp' in symtab:
             del symtab._symbols['go_wp']

         # Insert kernel in the OpenCL kernels file if it doesn't already
         # exist. If it does exist the existing one is re-used.
         # TODO 1572: Here we assume that in the same Invoke (scope) a
         # kernel with the same name will be the same kernel, but that
         # may not be true when doing multiple invokes.
         already_present = any(
             routine.name == kernel.name
             for routine in self._kernels_file.walk(Routine))
         if not already_present:
             self._kernels_file.addchild(kernel_copy)


     def _output_opencl_kernels_file(self):
         ''' Write the OpenCL kernels to a file using the OpenCL backend.

         The file is created in the configured kernel output directory with
         the name ``opencl_kernels_<idx>.cl``, where ``<idx>`` is the
         smallest non-negative integer for which no file with that name
         exists yet.

         :raises OSError: if the file cannot be created for any reason \
             other than a file with that name already existing (e.g. the \
             output directory is missing or is not writable), or if \
             writing to it fails.

         '''
         # TODO 1013: The code below duplicates some logic of the CodedKern
         # rename_and_write method. Ideally this should be moved out of
         # the AST and transformations and put into some kind of IOManager.

         ocl_writer = OpenCLWriter(kernels_local_size=64)
         new_kern_code = ocl_writer(self._kernels_file)

         output_dir = Config.get().kernel_output_dir

         # Compare against None explicitly because 0 is a valid (but falsy)
         # file descriptor.
         fdesc = None
         name_idx = -1
         while fdesc is None:
             name_idx += 1
             new_name = f"opencl_kernels_{name_idx}.cl"

             try:
                 # Atomically attempt to open the new kernel file (in case
                 # this is part of a parallel build)
                 fdesc = os.open(
                     os.path.join(output_dir, new_name),
                     os.O_CREAT | os.O_WRONLY | os.O_EXCL)
             except FileExistsError:
                 # The os.O_CREAT and os.O_EXCL flags in combination mean
                 # that open() raises an error if the file exists. Only
                 # that case is retried with the next index: any other
                 # OSError (missing directory, no permissions, ...) would
                 # repeat for every index and therefore never terminate.
                 continue

         # Write the modified AST out to file. The file object takes
         # ownership of the descriptor, guaranteeing that it is closed even
         # if the write fails and that the whole buffer is written.
         with os.fdopen(fdesc, "wb") as kernels_file:
             kernels_file.write(new_kern_code.encode())


     @staticmethod
     def _generate_set_args_call(kernel, scope):
         '''
         Build the statements needed to call the set_args subroutine of
         the provided kernel.

         Besides the call itself, the returned block contains one
         assignment for each field and each array grid property that
         casts (with the TRANSFER intrinsic) the device buffer into the
         symbol tagged ``<arg name>_cl_mem``, which is then the value given
         to the call.

         :param kernel: the kernel for which to generate a call to its
             arg_setter subroutine.
         :type kernel: :py:class:`psyclone.psyGen.CodedKern`
         :param scope: The node representing the scope where the call
             statements will be inserted.
         :type scope: :py:class:`psyclone.psyir.nodes.ScopingNode`

         :returns: a block of statements that represent the set_args call
         :rtype: :py:class:`psyclone.psyir.nodes.Schedule`

         :raises GenerationError: if the symbol table of the scope does not
             have a symbol tagged with any of the kernel boundaries.

         '''
         call_block = Schedule()

         # Retrieve symbol table and kernel symbol
         symtab = scope.symbol_table
         kernelsym = symtab.lookup_with_tag("kernel_" + kernel.name)

         # In OpenCL the iteration boundaries are passed as arguments to the
         # kernel because the global work size may exceed the dimensions and
         # therefore the updates outside the boundaries should be masked.
         # Collect the name of the symbol that defines each of them; the
         # generation can not proceed if any is missing.
         boundaries = []
         try:
             for boundary in ("xstart", "xstop", "ystart", "ystop"):
                 tag = boundary + "_" + kernel.name
                 boundaries.append(symtab.lookup_with_tag(tag).name)
         except KeyError as err:
             raise GenerationError(
                 f"Boundary symbol tag '{tag}' not found while generating the "
                 f"OpenCL code for kernel '{kernel.name}'. Make sure to apply "
                 f"the GOMoveIterationBoundariesInsideKernelTrans before "
                 f"attempting the OpenCL code generation.") from err

         api_config = Config.get().api_conf("gocean1.0")

         # The first argument of the set_args routine is the kernel object
         arguments = [Reference(kernelsym)]

         def _pass_as_cl_mem(source, cl_mem_symbol):
             ''' Add to call_block the cast of source into cl_mem_symbol
             and use that symbol as the next argument of the call. '''
             dest = Reference(cl_mem_symbol)
             icall = IntrinsicCall.create(IntrinsicCall.Intrinsic.TRANSFER,
                                          [source, dest])
             call_block.addchild(Assignment.create(dest.copy(), icall))
             arguments.append(Reference(cl_mem_symbol))

         for arg in kernel.arguments.args:
             arg_kind = arg.argument_type

             if arg_kind == "scalar":
                 is_boundary = arg.name in boundaries
                 value = arg.psyir_expression()
                 if is_boundary:
                     # Boundary values are 0-indexed in OpenCL and 1-indexed
                     # in PSyIR, therefore we need to subtract 1
                     value = BinaryOperation.create(
                         BinaryOperation.Operator.SUB,
                         value,
                         Literal("1", INTEGER_TYPE))
                 arguments.append(value)
                 continue

             if arg_kind == "field":
                 # Cast buffer to cl_mem type expected by OpenCL
                 field = symtab.lookup(arg.name)
                 symbol = symtab.lookup_with_tag(arg.name + "_cl_mem")
                 _pass_as_cl_mem(
                     StructureReference.create(field, ['device_ptr']),
                     symbol)
                 continue

             if arg_kind != "grid_property":
                 # Any other type of argument is not given to the call
                 continue

             garg = kernel.arguments.find_grid_access()
             if arg.is_scalar:
                 # pylint: disable=protected-access
                 grid_holder = symtab.lookup(garg.name)
                 members = api_config.grid_properties[
                     arg._property_name].fortran.split('%')[1:]
                 arguments.append(
                     StructureReference.create(grid_holder, members))
             else:
                 # Cast grid buffer to cl_mem type expected by OpenCL
                 device_grid_property = arg.name + "_device"
                 field = symtab.lookup(garg.name)
                 source = StructureReference.create(
                     field, ['grid', device_grid_property])
                 _pass_as_cl_mem(
                     source, symtab.lookup_with_tag(arg.name + "_cl_mem"))

         call_symbol = symtab.lookup_with_tag(kernel.name + "_set_args")
         call_block.addchild(Call.create(call_symbol, arguments))
         return call_block


     @staticmethod
     def _insert_ocl_arg_setter_routine(node, kernel):
         '''
         Provide the symbol of the subroutine that sets the OpenCL kernel
         arguments of the given PSy-layer kernel using FortCL, creating the
         subroutine inside the given container if it is not there yet.

         The generated subroutine is named ``<kernel name>_set_args`` and
         its symbol is tagged with the same string. It receives the kernel
         object followed by one argument per kernel argument and, for each
         of the latter, it calls clSetKernelArg and passes the returned
         value to check_status.

         :param node: the container where the new subroutine will be inserted.
         :type node: :py:class:`psyclone.psyir.nodes.Container`
         :param kernel: the kernel call for which to provide the arg_setter
             subroutine.
         :type kernel: :py:class:`psyclone.psyGen.CodedKern`

         :returns: the symbol representing the arg_setter subroutine.
         :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol`

         '''
         sub_name = kernel.name + "_set_args"

         # Re-use the subroutine if a previous call already generated it.
         try:
             return node.symbol_table.lookup_with_tag(sub_name)
         except KeyError:
             # It does not exist: the rest of the method generates it.
             pass

         # Create the new Routine and RoutineSymbol
         node.symbol_table.add(RoutineSymbol(sub_name), tag=sub_name)
         argsetter = Routine(sub_name)
         setter_table = argsetter.symbol_table

         # Symbols that the subroutine imports from other modules
         clfortran = ContainerSymbol("clfortran")
         clsetkernelarg = RoutineSymbol("clSetKernelArg",
                                        interface=ImportInterface(clfortran))
         iso_c = ContainerSymbol("iso_c_binding")
         c_sizeof = RoutineSymbol("C_SIZEOF", interface=ImportInterface(iso_c))
         c_loc = RoutineSymbol("C_LOC", interface=ImportInterface(iso_c))
         c_intptr_t = RoutineSymbol("c_intptr_t",
                                    interface=ImportInterface(iso_c))
         ocl_utils = ContainerSymbol("ocl_utils_mod")
         check_status = RoutineSymbol("check_status",
                                      interface=ImportInterface(ocl_utils))
         for imported in (clfortran, clsetkernelarg, iso_c, c_sizeof, c_loc,
                          c_intptr_t, ocl_utils, check_status):
             setter_table.add(imported)

         # The first argument of the subroutine is the kernel object
         kobj = setter_table.new_symbol(
             "kernel_obj", symbol_type=DataSymbol,
             interface=ArgumentInterface(ArgumentInterface.Access.READ),
             datatype=UnsupportedFortranType(
                 "INTEGER(KIND=c_intptr_t), TARGET :: kernel_obj"))
         arg_list = [kobj]

         # It is followed by one argument for each kernel call argument
         for arg in kernel.arguments.args:
             name = setter_table.next_available_name(arg.name)

             # This function requires 'TARGET' annotated declarations which
             # are not supported in the PSyIR, so we build them as
             # UnsupportedFortranType for now.
             if not arg.is_scalar:
                 # Everything that is not a scalar is a cl_mem pointer
                 # (c_intptr_t)
                 decl_prefix = \
                     "INTEGER(KIND=c_intptr_t), INTENT(IN), TARGET :: "
             elif arg.intrinsic_type == "real":
                 decl_prefix = "REAL(KIND=go_wp), INTENT(IN), TARGET :: "
             else:
                 decl_prefix = "INTEGER, INTENT(IN), TARGET :: "

             new_arg = DataSymbol(
                 name, datatype=UnsupportedFortranType(decl_prefix + name),
                 interface=ArgumentInterface(ArgumentInterface.Access.READ))
             setter_table.add(new_arg)
             arg_list.append(new_arg)

         setter_table.specify_argument_list(arg_list)

         # Local variable that stores the value returned by the OpenCL calls
         ierr = setter_table.new_symbol(
             "ierr", symbol_type=DataSymbol, datatype=INTEGER_TYPE)

         # Call the clSetKernelArg for each argument (the kernel object is
         # skipped) and a check_status to see if the OpenCL call has
         # succeeded
         for index, variable in enumerate(arg_list[1:]):
             set_arg = Call.create(
                 clsetkernelarg,
                 [Reference(kobj),
                  Literal(str(index), INTEGER_TYPE),
                  Call.create(c_sizeof, [Reference(variable)]),
                  Call.create(c_loc, [Reference(variable)])])
             argsetter.addchild(Assignment.create(Reference(ierr), set_arg))

             emsg = f"clSetKernelArg: arg {index} of {kernel.name}"
             argsetter.addchild(
                 Call.create(check_status,
                             [Literal(emsg, CHARACTER_TYPE),
                              Reference(ierr)]))

         argsetter.children[0].preceding_comment = \
             f"Set the arguments for the {kernel.name} OpenCL Kernel"

         # Add the subroutine as child of the provided node
         node.addchild(argsetter)

         return node.symbol_table.lookup_with_tag(sub_name)


     def _insert_opencl_init_routine(self, node):
         '''
         Provide the symbol of the subroutine that initialises the OpenCL
         environment using FortCL, creating the subroutine inside the given
         container if it is not there yet.

         The subroutine is written as Fortran source, converted to PSyIR
         with the Fortran reader and renamed to the name of the symbol
         tagged 'ocl_init_routine'.

         :param node: the container where the new subroutine will be inserted.
         :type node: :py:class:`psyclone.psyir.nodes.Container`

         :returns: the symbol representing the OpenCL initialisation subroutine.
         :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol`

         '''
         symtab = node.symbol_table
         try:
             # TODO #1572: The ocl_init routine may need to be regenerated if
             # there are multiple Invokes because _max_queue_number may have
             # increased and we need to load the kernels of both invokes.
             return symtab.lookup_with_tag("ocl_init_routine")
         except KeyError:
             # It does not exist: the rest of the method generates it.
             pass

         # Create the symbol for the routine and add it to the symbol table.
         init_symbol = symtab.new_symbol("psy_init",
                                         symbol_type=RoutineSymbol,
                                         tag="ocl_init_routine")
         subroutine_name = init_symbol.name

         # Choose a round-robin device number if it has MPI and multiple
         # accelerators.
         config = Config.get()
         distributed_memory = config.distributed_memory
         devices_per_node = config.ocl_devices_per_node
         additional_uses = ""
         additional_stmts = ""
         if devices_per_node > 1 and distributed_memory:
             additional_uses = "USE parallel_mod, ONLY: get_rank"
             additional_stmts = \
                 f"ocl_device_num = mod(get_rank()-1, {devices_per_node}) + 1"

         # Get a set of all kernel names in the Container. This implementation
         # currently assumes all of them will be available in OpenCL
         unique_kernels = {kernel.name for kernel in node.coded_kernels()}

         # Code of the subroutine in Fortran: declarations and environment
         # initialisation
         code = f'''

         subroutine psy_init()

           {additional_uses}

           use fortcl, only: ocl_env_init, add_kernels

           character(len=30) kernel_names({len(unique_kernels)})

           integer :: ocl_device_num=1

           logical, save :: initialised=.false.

           ! Check to make sure we only execute this routine once

           if (.not. initialised) then

             initialised = .true.

             ! Initialise the opencl environment/device

             {additional_stmts}

             call ocl_env_init({self._max_queue_number}, ocl_device_num, &

                 {".true." if self._enable_profiling else ".false."}, &

                 {".true." if self._out_of_order else ".false."})

             ! The kernels this psy layer module requires

         '''

         # One array element (1-indexed) for each of the kernel names
         code += "".join(
             f'kernel_names({position}) = "{kernel_name}"\n'
             for position, kernel_name in enumerate(unique_kernels, start=1))

         # Load the kernels and close the subroutine
         code += f'''\

             ! Create the opencl kernel objects. This expects to find all of

             ! the compiled kernels in FORTCL_KERNELS_FILE environment variable

             call add_kernels({len(unique_kernels)}, kernel_names)

           end if

         end subroutine psy_init'''

         # Obtain the PSyIR representation of the code above and give the
         # subroutine the name of the symbol created for it
         container = FortranReader().psyir_from_source(code)
         subroutine = container.children[0]
         subroutine.name = subroutine_name

         # Add the subroutine as child of the provided node
         node.addchild(subroutine.detach())

         return symtab.lookup_with_tag("ocl_init_routine")


     @staticmethod
     def _insert_initialise_grid_buffers(node):
         '''
         Provide the symbol of the subroutine that initialises all OpenCL
         grid buffers in the OpenCL device using FortCL, creating the
         subroutine inside the given container if it is not there yet.

         The subroutine is written as Fortran source, converted to PSyIR
         with the Fortran reader and renamed to the name of the symbol
         tagged 'ocl_init_grid_buffers'. It creates a read-only buffer for
         each of the integer and real array grid properties found in the
         GOcean API configuration.

         :param node: the container where the new subroutine will be inserted.
         :type node: :py:class:`psyclone.psyir.nodes.Container`

         :returns: the symbol of the grid buffer initialisation subroutine.
         :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol`

         '''
         # pylint: disable=too-many-locals
         symtab = node.symbol_table
         try:
             return symtab.lookup_with_tag("ocl_init_grid_buffers")
         except KeyError:
             # It does not exist: the rest of the method generates it.
             pass

         # Create the symbol for the routine and add it to the symbol table.
         init_symbol = symtab.new_symbol("initialise_grid_device_buffers",
                                         symbol_type=RoutineSymbol,
                                         tag="ocl_init_grid_buffers")
         subroutine_name = init_symbol.name

         # Get the GOcean API property names used in this routine
         api_config = Config.get().api_conf("gocean1.0")
         props = api_config.grid_properties
         num_x = props["go_grid_nx"].fortran.format("field")
         num_y = props["go_grid_ny"].fortran.format("field")

         # Classify the array properties by their intrinsic type
         arrays_by_type = {"integer": [], "real": []}
         for key, prop in props.items():
             # TODO #676: go_grid_data is ignored because it is actually a
             # field property
             if key == "go_grid_data" or prop.type != "array":
                 continue
             if prop.intrinsic_type in arrays_by_type:
                 arrays_by_type[prop.intrinsic_type].append(
                     prop.fortran.format("field"))
         int_arrays = arrays_by_type["integer"]
         real_arrays = arrays_by_type["real"]

         # Code of the subroutine in Fortran. The first array of each type
         # is used to compute the size of the buffers, so at least one
         # integer and one real array property are needed.
         code = f'''

         subroutine initialise_device_grid(field)

             USE fortcl, ONLY: create_ronly_buffer

             USE iso_c_binding, only: c_size_t

             use field_mod

             type(r2d_field), intent(inout), target :: field

             integer(kind=c_size_t) size_in_bytes

             IF (.not. c_associated({int_arrays[0]}_device)) THEN

                 ! Create integer grid fields

                 size_in_bytes = int({num_x}*{num_y}, 8) * &

                     c_sizeof({int_arrays[0]}(1,1))

         '''

         code += "".join(f'''

                 {int_array}_device = transfer( &

                     create_ronly_buffer(size_in_bytes), {int_array}_device)

             ''' for int_array in int_arrays)

         code += f'''

                 ! Create real grid buffers

                 size_in_bytes = int({num_x} * {num_y}, 8) * &

                                     c_sizeof({real_arrays[0]}(1,1))

         '''

         code += "".join(f'''

                 {real_array}_device = transfer( &

                     create_ronly_buffer(size_in_bytes), {real_array}_device)

             ''' for real_array in real_arrays)

         code += '''

             END IF

         end subroutine initialise_device_grid

         '''

         # Obtain the PSyIR representation of the code above and give the
         # subroutine the name of the symbol created for it
         container = FortranReader().psyir_from_source(code)
         subroutine = container.children[0]
         subroutine.name = subroutine_name

         # Add the subroutine as child of the provided node
         node.addchild(subroutine.detach())

         return symtab.lookup_with_tag("ocl_init_grid_buffers")


     def _insert_write_grid_buffers(self, node):
         '''
         Returns the symbol of a subroutine that writes the values of the grid
         properties into the OpenCL device buffers using FortCL. If the
         subroutine doesn't already exist it also generates it.

         The subroutine is written as Fortran source, converted to PSyIR
         with the Fortran reader and renamed to the name of the symbol
         tagged 'ocl_write_grid_buffers'. All the writes are enqueued in
         the queue given by ``self._OCL_MANAGEMENT_QUEUE``. The list of
         grid properties that are written is fixed in this method.

         :param node: the container where the new subroutine will be inserted.
         :type node: :py:class:`psyclone.psyir.nodes.Container`

         :returns: the symbol representing the grid buffers writing subroutine.
         :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol`

         '''
         # pylint: disable=too-many-locals
         symtab = node.symbol_table
         try:
             return symtab.lookup_with_tag("ocl_write_grid_buffers")
         except KeyError:
             # If the Symbol does not exist, the rest of this method
             # will generate it.
             pass

         # Create the symbol for the routine and add it to the symbol table.
         subroutine_name = symtab.new_symbol("write_grid_buffers",
                                             symbol_type=RoutineSymbol,
                                             tag="ocl_write_grid_buffers").name

         # Get the GOcean API property names used in this routine
         api_config = Config.get().api_conf("gocean1.0")
         props = api_config.grid_properties
         num_x = props["go_grid_nx"].fortran.format("field")
         num_y = props["go_grid_ny"].fortran.format("field")

         # Code of the subroutine in Fortran: declarations, write of the
         # integer buffer (tmask) and size of the real buffers
         code = f'''

         subroutine write_device_grid(field)

             USE fortcl, ONLY: get_cmd_queues

             use iso_c_binding, only: c_intptr_t, c_size_t, c_sizeof

             USE clfortran

             USE ocl_utils_mod, ONLY: check_status

             type(r2d_field), intent(inout), target :: field

             integer(kind=c_size_t) size_in_bytes

             INTEGER(c_intptr_t), pointer :: cmd_queues(:)

             integer(c_intptr_t) :: cl_mem

             integer :: ierr

             cmd_queues => get_cmd_queues()

             ! Integer grid buffers

             size_in_bytes = int({num_x} * {num_y}, 8) * &

                             c_sizeof(field%grid%tmask(1,1))

             cl_mem = transfer(field%grid%tmask_device, cl_mem)

             ierr = clEnqueueWriteBuffer( &

                         cmd_queues({self._OCL_MANAGEMENT_QUEUE}), &

                         cl_mem, CL_TRUE, 0_8, size_in_bytes, &

                         C_LOC(field%grid%tmask), 0, C_NULL_PTR, C_NULL_PTR)

             CALL check_status("clEnqueueWriteBuffer tmask", ierr)

             ! Real grid buffers

             size_in_bytes = int({num_x} * {num_y}, 8) * &

                             c_sizeof(field%grid%area_t(1,1))

         '''

         # Template of the write of a real grid property: {0} is the name
         # of the property and {1} the number of the queue to use
         write_str = '''

             cl_mem = transfer(field%grid%{0}_device, cl_mem)

             ierr = clEnqueueWriteBuffer(cmd_queues({1}), &

                        cl_mem, CL_TRUE, 0_8, size_in_bytes, &

                        C_LOC(field%grid%{0}), 0, C_NULL_PTR, C_NULL_PTR)

             CALL check_status("clEnqueueWriteBuffer {0}_device", ierr)

         '''
         for grid_prop in ('area_t', 'area_u', 'area_v', 'dx_u', 'dx_v',
                           'dx_t', 'dy_u', 'dy_v', 'dy_t', 'gphiu', 'gphiv'):
             code += write_str.format(grid_prop, self._OCL_MANAGEMENT_QUEUE)
         code += "end subroutine write_device_grid"

         # Obtain the PSyIR representation of the code above
         fortran_reader = FortranReader()
         container = fortran_reader.psyir_from_source(code)
         subroutine = container.children[0]
         # Rename subroutine
         subroutine.name = subroutine_name

         # Add the subroutine as child of the provided node
         node.addchild(subroutine.detach())

         return symtab.lookup_with_tag("ocl_write_grid_buffers")


     def _insert_ocl_read_from_device_function(self, node):
         '''
         Provide the routine that copies field data from an OpenCL device
         back to the host through FortCL, generating it inside the given
         container the first time it is requested.

         The routine is registered in the symbol table of the container
         under the "ocl_read_func" tag. When a symbol with that tag is
         already present it is returned as-is and nothing is generated.

         :param node: the container where the new subroutine will be inserted.
         :type node: :py:class:`psyclone.psyir.nodes.Container`

         :returns: the symbol of the buffer data retrieving subroutine.
         :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol`

         '''
         tag = "ocl_read_func"
         table = node.symbol_table
         try:
             existing = table.lookup_with_tag(tag)
         except KeyError:
             # No symbol carries this tag yet, so the routine has to be
             # generated below.
             pass
         else:
             return existing

         # Register a new routine symbol under the tag. The name it is
         # given by the symbol table is the one the generated subroutine
         # will finally carry.
         routine_symbol = table.new_symbol("read_from_device",
                                           symbol_type=RoutineSymbol,
                                           tag=tag)
         subroutine_name = routine_symbol.name

         # Fortran template of the subroutine. It is written with the
         # placeholder name 'read_sub' and renamed once it has been parsed.
         code = f'''

         subroutine read_sub(from, to, startx, starty, nx, ny, blocking)

             USE iso_c_binding, only: c_ptr, c_intptr_t, c_size_t, c_sizeof

             USE ocl_utils_mod, ONLY: check_status

             use kind_params_mod, only: go_wp

             USE clfortran

             USE fortcl, ONLY: get_cmd_queues

             type(c_ptr), intent(in) :: from

             real(go_wp), intent(inout), dimension(:,:), target :: to

             integer, intent(in) :: startx, starty, nx, ny

             logical, intent(in) :: blocking

             INTEGER(c_size_t) :: size_in_bytes, offset_in_bytes

             integer(c_intptr_t) :: cl_mem

             INTEGER(c_intptr_t), pointer :: cmd_queues(:)

             integer :: ierr, i


             ! Give the from pointer the appropriate OpenCL memory object type

             cl_mem = transfer(from, cl_mem)

             cmd_queues => get_cmd_queues()


             ! Two copy strategies depending on how much of the total length

             ! nx covers.

             if (nx < size(to, 1) / 2) then

                 ! Dispatch asynchronous copies of just the contiguous data.

                 do i = starty, starty+ny

                     size_in_bytes = int(nx, 8) * c_sizeof(to(1,1))

                     offset_in_bytes = int(size(to, 1) * (i-1) + (startx-1)) &

                                       * c_sizeof(to(1,1))

                     ierr = clEnqueueReadBuffer( &

                         cmd_queues({self._OCL_MANAGEMENT_QUEUE}), cl_mem, &

                         CL_FALSE, offset_in_bytes, size_in_bytes, &

                         C_LOC(to(startx, i)), 0, C_NULL_PTR, C_NULL_PTR)

                     CALL check_status("clEnqueueReadBuffer", ierr)

                 enddo

                 if (blocking) then

                     CALL check_status("clFinish on read", &

                         clFinish(cmd_queues({self._OCL_MANAGEMENT_QUEUE})))

                 endif

             else

                 ! Copy across the whole starty:starty+ny rows in a single

                 ! copy operation.

                 size_in_bytes = int(size(to, 1) * ny, 8) * c_sizeof(to(1,1))

                 offset_in_bytes = int(size(to,1)*(starty-1), 8) &

                                   * c_sizeof(to(1,1))

                 ierr = clEnqueueReadBuffer( &

                     cmd_queues({self._OCL_MANAGEMENT_QUEUE}), cl_mem, &

                     CL_TRUE, offset_in_bytes, size_in_bytes, &

                     C_LOC(to(1,starty)), 0, C_NULL_PTR, C_NULL_PTR)

                 CALL check_status("clEnqueueReadBuffer", ierr)

             endif

         end subroutine read_sub

         '''

         # Turn the Fortran text into PSyIR; the subroutine is the first
         # child of the container produced by the reader.
         routine = FortranReader().psyir_from_source(code).children[0]

         # Replace the placeholder name with the one registered above.
         routine.name = subroutine_name

         # Move the routine out of the temporary tree and into the
         # provided container.
         node.addchild(routine.detach())

         return table.lookup_with_tag(tag)


     def _insert_ocl_write_to_device_function(self, node):
         '''
         Provide the routine that copies host buffer data into an OpenCL
         device through FortCL, generating it inside the given container
         the first time it is requested.

         The routine is registered in the symbol table of the container
         under the "ocl_write_func" tag. When a symbol with that tag is
         already present it is returned as-is and nothing is generated.

         :param node: the container where the new subroutine will be inserted.
         :type node: :py:class:`psyclone.psyir.nodes.Container`

         :returns: the symbol of the buffer writing subroutine.
         :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol`

         '''
         tag = "ocl_write_func"
         table = node.symbol_table
         try:
             existing = table.lookup_with_tag(tag)
         except KeyError:
             # No symbol carries this tag yet, so the routine has to be
             # generated below.
             pass
         else:
             return existing

         # Register a new routine symbol under the tag. The name it is
         # given by the symbol table is the one the generated subroutine
         # will finally carry.
         routine_symbol = table.new_symbol("write_to_device",
                                           symbol_type=RoutineSymbol,
                                           tag=tag)
         subroutine_name = routine_symbol.name

         # Fortran template of the subroutine. It is written with the
         # placeholder name 'write_sub' and renamed once it has been parsed.
         code = f'''

         subroutine write_sub(from, to, startx, starty, nx, ny, blocking)

             USE iso_c_binding, only: c_ptr, c_intptr_t, c_size_t, c_sizeof

             USE ocl_utils_mod, ONLY: check_status

             use kind_params_mod, only: go_wp

             USE clfortran

             USE fortcl, ONLY: get_cmd_queues

             real(go_wp), intent(in), dimension(:,:), target :: from

             type(c_ptr), intent(in) :: to

             integer, intent(in) :: startx, starty, nx, ny

             logical, intent(in) :: blocking

             integer(c_intptr_t) :: cl_mem

             INTEGER(c_size_t) :: size_in_bytes, offset_in_bytes

             INTEGER(c_intptr_t), pointer :: cmd_queues(:)

             integer :: ierr, i


             ! Give the to pointer the appropriate OpenCL memory object type

             cl_mem = transfer(to, cl_mem)

             cmd_queues => get_cmd_queues()


             ! Two copy strategies depending on how much of the total length

             ! nx covers.

             if (nx < size(from,1) / 2) then

                 ! Dispatch asynchronous copies of just the contiguous data.

                 do i=starty, starty+ny

                     size_in_bytes = int(nx, 8) * c_sizeof(from(1,1))

                     offset_in_bytes = int(size(from, 1) * (i-1) + (startx-1)) &

                                       * c_sizeof(from(1,1))

                     ierr = clEnqueueWriteBuffer( &

                         cmd_queues({self._OCL_MANAGEMENT_QUEUE}), cl_mem, &

                         CL_FALSE, offset_in_bytes, size_in_bytes, &

                         C_LOC(from(startx, i)), 0, C_NULL_PTR, C_NULL_PTR)

                     CALL check_status("clEnqueueWriteBuffer", ierr)

                 enddo

                 if (blocking) then

                     CALL check_status("clFinish on write", &

                         clFinish(cmd_queues({self._OCL_MANAGEMENT_QUEUE})))

                 endif

             else

                 ! Copy across the whole starty:starty+ny rows in a single

                 ! copy operation.

                 size_in_bytes = int(size(from,1) * ny, 8) * c_sizeof(from(1,1))

                 offset_in_bytes = int(size(from,1) * (starty-1)) &

                                   * c_sizeof(from(1,1))

                 ierr = clEnqueueWriteBuffer(&

                     cmd_queues({self._OCL_MANAGEMENT_QUEUE}), cl_mem, &

                     CL_TRUE, offset_in_bytes, size_in_bytes, &

                     C_LOC(from(1, starty)), 0, C_NULL_PTR, C_NULL_PTR)

                 CALL check_status("clEnqueueWriteBuffer", ierr)

             endif

         end subroutine write_sub

         '''

         # Turn the Fortran text into PSyIR; the subroutine is the first
         # child of the container produced by the reader.
         routine = FortranReader().psyir_from_source(code).children[0]

         # Replace the placeholder name with the one registered above.
         routine.name = subroutine_name

         # Move the routine out of the temporary tree and into the
         # provided container.
         node.addchild(routine.detach())

         return table.lookup_with_tag(tag)


     @staticmethod
     def _insert_ocl_initialise_buffer(node):
         '''
         Provide the routine that initialises an OpenCL buffer on the
         OpenCL device through FortCL, generating it inside the given
         container the first time it is requested.

         The routine is registered in the symbol table of the container
         under the "ocl_init_buffer_func" tag. When a symbol with that tag
         is already present it is returned as-is and nothing is generated.

         The generated code points the field at the routines tagged
         "ocl_read_func" and "ocl_write_func". This method only looks those
         symbols up, it does not create them.

         :param node: the container where the new subroutine will be inserted.
         :type node: :py:class:`psyclone.psyir.nodes.Container`

         :returns: the symbol of the buffer initialisation subroutine.
         :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol`

         '''
         # pylint: disable=too-many-locals
         tag = "ocl_init_buffer_func"
         table = node.symbol_table
         try:
             existing = table.lookup_with_tag(tag)
         except KeyError:
             # No symbol carries this tag yet, so the rest of this method
             # generates the routine.
             pass
         else:
             return existing

         # Register a new routine symbol under the tag. The name it is
         # given by the symbol table is the one the generated subroutine
         # will finally carry.
         routine_symbol = table.new_symbol("initialise_device_buffer",
                                           symbol_type=RoutineSymbol,
                                           tag=tag)
         subroutine_name = routine_symbol.name

         # Fortran expressions of the GOcean API grid properties used in
         # the template, each one specialised for a dummy argument
         # named 'field'.
         grid_props = Config.get().api_conf("gocean1.0").grid_properties
         host_buff, num_x, num_y = (
             grid_props[key].fortran.format("field")
             for key in ("go_grid_data", "go_grid_nx", "go_grid_ny"))

         # Names of the routines that the field procedure pointers will
         # reference. Both are fetched by tag from the symbol table;
         # lookup_with_tag raises KeyError for a tag that is not registered.
         read_fp = table.lookup_with_tag("ocl_read_func").name
         write_fp = table.lookup_with_tag("ocl_write_func").name

         # Fortran template of the subroutine. It is renamed to the
         # registered name once it has been parsed.
         code = f'''

         subroutine initialise_device_buffer(field)

             USE fortcl, ONLY: create_rw_buffer

             USE iso_c_binding, only: c_size_t

             use field_mod

             type(r2d_field), intent(inout), target :: field

             integer(kind=c_size_t) size_in_bytes

             IF (.NOT. field%data_on_device) THEN

                 size_in_bytes = int({num_x} * {num_y}, 8) * &

                                     c_sizeof({host_buff}(1,1))

                 ! Create buffer on device, we store it without type information

                 ! on the dl_esm_inf pointer (transfer/static_cast to void*)

                 field%device_ptr = transfer( &

                     create_rw_buffer(size_in_bytes), &

                     field%device_ptr)

                 field%data_on_device = .true.

                 field%read_from_device_f => {read_fp}

                 field%write_to_device_f => {write_fp}

             END IF

         end subroutine initialise_device_buffer

         '''

         # Turn the Fortran text into PSyIR; the subroutine is the first
         # child of the container produced by the reader.
         routine = FortranReader().psyir_from_source(code).children[0]

         # Give the routine the name registered above.
         routine.name = subroutine_name

         # Move the routine out of the temporary tree and into the
         # provided container.
         node.addchild(routine.detach())

         return table.lookup_with_tag(tag)


 # Explicit list of the names this module exports. It is declared for the
 # AutoAPI documentation generation.

 __all__ = ["GOOpenCLTrans"]

psyclone.domain.common.transformations.kernel_module_inline_trans.KernelModuleInlineTrans
Definition: kernel_module_inline_trans.py:51

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans
Definition: gocean_opencl_trans.py:60

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans._add_end_barrier
def _add_end_barrier(self, node, flag, cl_finish, qlist)
Definition: gocean_opencl_trans.py:613

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans._add_divisibility_check
def _add_divisibility_check(node, position, check_status, global_size_expr, local_size)
Definition: gocean_opencl_trans.py:642

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans._insert_kernel_code_in_opencl_file
def _insert_kernel_code_in_opencl_file(self, kernel)
Definition: gocean_opencl_trans.py:738

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans._out_of_order
_out_of_order
Definition: gocean_opencl_trans.py:253

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans._out_of_order
bool _out_of_order
Definition: gocean_opencl_trans.py:93

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans._insert_write_grid_buffers
def _insert_write_grid_buffers(self, node)
Definition: gocean_opencl_trans.py:1203

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans._insert_opencl_init_routine
def _insert_opencl_init_routine(self, node)
Definition: gocean_opencl_trans.py:1023

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans._generate_set_args_call
def _generate_set_args_call(kernel, scope)
Definition: gocean_opencl_trans.py:815

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans._output_opencl_kernels_file
def _output_opencl_kernels_file(self)
Definition: gocean_opencl_trans.py:781

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans._add_ready_check
def _add_ready_check(node, position, check_status, kernel_name, flag, cl_finish, cmd_queue)
Definition: gocean_opencl_trans.py:711

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans._insert_initialise_grid_buffers
def _insert_initialise_grid_buffers(node)
Definition: gocean_opencl_trans.py:1109

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans.name
def name(self)
Definition: gocean_opencl_trans.py:100

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans._enable_profiling
_enable_profiling
Definition: gocean_opencl_trans.py:250

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans._insert_ocl_arg_setter_routine
def _insert_ocl_arg_setter_routine(node, kernel)
Definition: gocean_opencl_trans.py:910

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans._enable_profiling
bool _enable_profiling
Definition: gocean_opencl_trans.py:91

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans._insert_ocl_write_to_device_function
def _insert_ocl_write_to_device_function(self, node)
Definition: gocean_opencl_trans.py:1379

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans._insert_ocl_initialise_buffer
def _insert_ocl_initialise_buffer(node)
Definition: gocean_opencl_trans.py:1472

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans._max_queue_number
int _max_queue_number
Definition: gocean_opencl_trans.py:89

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans._insert_ocl_read_from_device_function
def _insert_ocl_read_from_device_function(self, node)
Definition: gocean_opencl_trans.py:1286

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans._kernels_file
_kernels_file
Definition: gocean_opencl_trans.py:97

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans.validate
def validate(self, node, options=None)
Definition: gocean_opencl_trans.py:107

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans._max_queue_number
_max_queue_number
Definition: gocean_opencl_trans.py:262

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans.apply
def apply(self, node, options=None)
Definition: gocean_opencl_trans.py:222

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans._OCL_MANAGEMENT_QUEUE
int _OCL_MANAGEMENT_QUEUE
Definition: gocean_opencl_trans.py:80

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans._add_kernel_check
def _add_kernel_check(node, position, check_status, kernel_name, flag, cl_finish, cmd_queue)
Definition: gocean_opencl_trans.py:674

psyclone.domain.gocean.transformations.gocean_opencl_trans.GOOpenCLTrans._transformed_invokes
int _transformed_invokes
Definition: gocean_opencl_trans.py:95

psyclone.errors.GenerationError
Definition: errors.py:102

psyclone.psyGen.Transformation
Definition: psyGen.py:2759

psyclone.psyGen.Transformation.validate
def validate(self, node, options=None)
Definition: psyGen.py:2799

psyclone.psyir.backend.opencl.OpenCLWriter
Definition: opencl.py:48

psyclone.psyir.frontend.fortran.FortranReader
Definition: fortran.py:52

psyclone.configuration
Definition: configuration.py:1

psyclone.domain.common.transformations
Definition: __init__.py:1

psyclone.errors
Definition: errors.py:1

psyclone.gocean1p0
Definition: gocean1p0.py:1

psyclone.psyGen
Definition: psyGen.py:1

psyclone.psyir.backend.opencl
Definition: opencl.py:1

psyclone.psyir.frontend.fortran
Definition: fortran.py:1

psyclone.psyir.nodes
Definition: __init__.py:1

psyclone.psyir.symbols
Definition: __init__.py:1

psyclone.transformations
Definition: transformations.py:1