Reference Guide  2.5.0
gocean_opencl_trans.py
1 # -----------------------------------------------------------------------------
2 # BSD 3-Clause License
3 #
4 # Copyright (c) 2021-2024, Science and Technology Facilities Council.
5 # All rights reserved.
6 #
7 # Redistribution and use in source and binary forms, with or without
8 # modification, are permitted provided that the following conditions are met:
9 #
10 # * Redistributions of source code must retain the above copyright notice, this
11 # list of conditions and the following disclaimer.
12 #
13 # * Redistributions in binary form must reproduce the above copyright notice,
14 # this list of conditions and the following disclaimer in the documentation
15 # and/or other materials provided with the distribution.
16 #
17 # * Neither the name of the copyright holder nor the names of its
18 # contributors may be used to endorse or promote products derived from
19 # this software without specific prior written permission.
20 #
21 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
29 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
31 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32 # POSSIBILITY OF SUCH DAMAGE.
33 # -----------------------------------------------------------------------------
34 # Authors R. W. Ford, A. R. Porter and S. Siso, STFC Daresbury Lab
35 
36 '''This module contains the GOcean-specific OpenCL transformation.
37 '''
38 
39 import os
40 
41 from fparser.two import Fortran2003
42 from psyclone.configuration import Config
43 from psyclone.domain.common.transformations import KernelModuleInlineTrans
44 from psyclone.errors import GenerationError
45 from psyclone.gocean1p0 import GOInvokeSchedule, GOLoop
46 from psyclone.psyGen import Transformation, args_filter, InvokeSchedule, \
47  HaloExchange
48 from psyclone.psyir.backend.opencl import OpenCLWriter
49 from psyclone.psyir.frontend.fortran import FortranReader
50 from psyclone.psyir.nodes import Routine, Call, Reference, Literal, \
51  Assignment, IfBlock, ArrayReference, Schedule, BinaryOperation, \
52  StructureReference, FileContainer, CodeBlock, IntrinsicCall
53 from psyclone.psyir.symbols import (
54  ArrayType, DataSymbol, RoutineSymbol, ContainerSymbol,
55  UnsupportedFortranType, ArgumentInterface, ImportInterface,
56  INTEGER_TYPE, CHARACTER_TYPE, BOOLEAN_TYPE, ScalarType)
57 from psyclone.transformations import TransformationError
58 
59 
61  '''
62  Switches on/off the generation of an OpenCL PSy layer for a given
63  InvokeSchedule. Additionally, it will generate OpenCL kernels for
64  each of the kernels referenced by the Invoke. For example:
65 
66  >>> from psyclone.parse.algorithm import parse
67  >>> from psyclone.psyGen import PSyFactory
68  >>> API = "gocean1.0"
69  >>> FILENAME = "shallow_alg.f90" # examples/gocean/eg1
70  >>> ast, invoke_info = parse(FILENAME, api=API)
71  >>> psy = PSyFactory(API, distributed_memory=False).create(invoke_info)
72  >>> schedule = psy.invokes.get('invoke_0').schedule
73  >>> ocl_trans = GOOpenCLTrans()
74  >>> ocl_trans.apply(schedule)
75  >>> print(schedule.view())
76 
77  '''
78  # Specify which OpenCL command queue to use for management operations like
79  # data transfers when generating an OpenCL PSy-layer
80  _OCL_MANAGEMENT_QUEUE = 1
81 
82  # TODO #1572: These are class attributes because multiple invokes may need
83  # to generate a single OpenCL environment (e.g. to share the device data
84  # pointers) and therefore guarantee the same properties, but this hasn't
85  # been tested. PSycloneBench ShallowWater could be an example of this.
86 
87  # Biggest queue number that any kernel is allocated to. The OpenCL
88  # environment should be set up with at least this number of queues.
89  _max_queue_number = 1
90  # Whether to enable the profiling option in the OpenCL environment
91  _enable_profiling = False
92  # Whether to enable the out_of_order option in the OpenCL environment
93  _out_of_order = False
94  # Total number of invokes that have been transformed to OpenCL
95  _transformed_invokes = 0
96  # Reference to the OpenCL kernels file
97  _kernels_file = None
98 
99  @property
100  def name(self):
101  '''
102  :returns: the name of this transformation.
103  :rtype: str
104  '''
105  return "GOOpenCLTrans"
106 
107  def validate(self, node, options=None):
108  '''
109  Checks that the supplied InvokeSchedule is valid and that an OpenCL
110  version of it can be generated.
111 
112  :param node: the Schedule to check.
113  :type node: :py:class:`psyclone.psyGen.InvokeSchedule`
114  :param options: a dictionary with options for transformations.
115  :type options: dict of str:value or None
116  :param bool options["enable_profiling"]: whether or not to set up the \
117  OpenCL environment with the profiling option enabled.
118  :param bool options["out_of_order"]: whether or not to set up the \
119  OpenCL environment with the out_of_order option enabled.
120  :param bool options["end_barrier"]: whether or not to add an OpenCL \
121  barrier at the end of the transformed invoke.
122 
123  :raises TransformationError: if the InvokeSchedule is not for the \
124  GOcean1.0 API.
125  :raises TransformationError: if any of the kernels have arguments \
126  which are passed as a literal.
127  :raises TransformationError: if any of the provided options is invalid.
128  :raises TransformationError: if any of the provided options is not \
129  compatible with a previous OpenCL
130  environment.
131  :raises TransformationError: if any kernel in this invoke has a \
132  global variable used by an import.
133  :raises TransformationError: if any kernel does not iterate over \
134  the whole grid.
135  '''
136 
137  if isinstance(node, InvokeSchedule):
138  if not isinstance(node, GOInvokeSchedule):
139  raise TransformationError(
140  f"OpenCL generation is currently only supported for the "
141  f"GOcean API but got an InvokeSchedule of type: "
142  f"'{type(node).__name__}'")
143  else:
144  raise TransformationError(
145  f"Error in GOOpenCLTrans: the supplied node must be a (sub-"
146  f"class of) InvokeSchedule but got {type(node)}")
147 
148  # Validate options map
149  valid_options = ['end_barrier', 'enable_profiling', 'out_of_order']
150  for key, value in options.items():
151  if key in valid_options:
152  # All current options should contain boolean values
153  if not isinstance(value, bool):
154  raise TransformationError(
155  f"InvokeSchedule OpenCL option '{key}' should be a "
156  f"boolean.")
157  else:
158  raise TransformationError(
159  f"InvokeSchedule does not support the OpenCL option "
160  f"'{key}'. The supported options are: {valid_options}.")
161 
162  # Validate that the options are valid with previously generated OpenCL
163  if self._transformed_invokes_transformed_invokes > 0:
164  if ('enable_profiling' in options and
165  self._enable_profiling_enable_profiling_enable_profiling != options['enable_profiling']):
166  raise TransformationError(
167  f"Can't generate an OpenCL Invoke with enable_profiling='"
168  f"{options['enable_profiling']}' since a previous "
169  f"transformation used a different value, and their OpenCL"
170  f" environments must match.")
171 
172  if ('out_of_order' in options and
173  self._out_of_order_out_of_order_out_of_order != options['out_of_order']):
174  raise TransformationError(
175  f"Can't generate an OpenCL Invoke with out_of_order='"
176  f"{options['out_of_order']}' since a previous "
177  f"transformation used a different value, and their OpenCL "
178  f"environments must match.")
179 
180  # Now we need to check that none of the invoke arguments is a literal
181  args = args_filter(node.args, arg_types=["scalar"])
182  for arg in args:
183  if arg.is_literal:
184  raise TransformationError(
185  f"Cannot generate OpenCL for Invokes that contain kernel "
186  f"arguments which are a literal, but found the literal "
187  f"'{arg.name}' used as an argument in invoke "
188  f"'{node.name}'.")
189 
190  # Check that we can construct the PSyIR and SymbolTable of each of
191  # the kernels in this Schedule. Also check that none of them access
192  # any form of global data (that is not a routine argument).
193  for kern in node.kernels():
195  ksched = kern.get_kernel_schedule()
196  global_variables = ksched.symbol_table.imported_symbols
197  if global_variables:
198  raise TransformationError(
199  f"The Symbol Table for kernel '{kern.name}' contains the "
200  f"following symbols with 'global' scope: "
201  f"{[sym.name for sym in global_variables]}. An OpenCL "
202  f"kernel cannot call other kernels and all of the data it "
203  f"accesses must be passed by argument. Use the "
204  f"KernelImportsToArguments transformation to convert such "
205  f"symbols to kernel arguments first.")
206 
207  # In OpenCL all kernel loops should iterate the whole grid
208  for kernel in node.kernels():
209  inner_loop = kernel.ancestor(GOLoop)
210  outer_loop = inner_loop.ancestor(GOLoop)
211  if not (inner_loop.field_space == "go_every" and
212  outer_loop.field_space == "go_every" and
213  inner_loop.iteration_space == "go_all_pts" and
214  outer_loop.iteration_space == "go_all_pts"):
215  raise TransformationError(
216  f"The kernel '{kernel.name}' does not iterate over all "
217  f"grid points. This is a necessary requirement for "
218  f"generating the OpenCL code and can be done by applying "
219  f"the GOMoveIterationBoundariesInsideKernelTrans to each "
220  f"kernel before the GOOpenCLTrans.")
221 
222  def apply(self, node, options=None):
223  '''
224  Apply the OpenCL transformation to the supplied GOInvokeSchedule. This
225  causes PSyclone to generate an OpenCL version of the corresponding
226  PSy-layer routine. The generated code makes use of the FortCL
227  library (https://github.com/stfc/FortCL) in order to manage the
228  OpenCL device directly from Fortran.
229 
230  :param node: the InvokeSchedule to transform.
231  :type node: :py:class:`psyclone.psyGen.GOInvokeSchedule`
232  :param options: set of option to tune the OpenCL generation.
233  :type options: dict of str:value or None
234  :param bool options["enable_profiling"]: whether or not to set up the \
235  OpenCL environment with the profiling option enabled.
236  :param bool options["out_of_order"]: whether or not to set up the \
237  OpenCL environment with the out_of_order option enabled.
238  :param bool options["end_barrier"]: whether or not to add an OpenCL \
239  barrier at the end of the transformed invoke.
240 
241  '''
242  if not options:
243  options = {}
244 
245  self.validatevalidatevalidate(node, options)
246  api_config = Config.get().api_conf("gocean1.0")
247 
248  # Update class attributes
249  if 'enable_profiling' in options:
250  self._enable_profiling_enable_profiling_enable_profiling = options['enable_profiling']
251 
252  if 'out_of_order' in options:
253  self._out_of_order_out_of_order_out_of_order = options['out_of_order']
254 
255  self._transformed_invokes_transformed_invokes += 1
256 
257  # Get end_barrier option
258  end_barrier = options.get('end_barrier', True)
259 
260  # Update the maximum value that the queue_number have.
261  for kernel in node.coded_kernels():
262  self._max_queue_number_max_queue_number_max_queue_number = max(self._max_queue_number_max_queue_number_max_queue_number,
263  kernel.opencl_options["queue_number"])
264 
265  # Insert, if they don't already exist, the necessary OpenCL helper
266  # subroutines in the root Container.
267  psy_init = self._insert_opencl_init_routine_insert_opencl_init_routine(node.root)
268  init_grid = self._insert_initialise_grid_buffers_insert_initialise_grid_buffers(node.root)
269  write_grid_buf = self._insert_write_grid_buffers_insert_write_grid_buffers(node.root)
270  self._insert_ocl_read_from_device_function_insert_ocl_read_from_device_function(node.root)
271  self._insert_ocl_write_to_device_function_insert_ocl_write_to_device_function(node.root)
272  init_buf = self._insert_ocl_initialise_buffer_insert_ocl_initialise_buffer(node.root)
273 
274  for kern in node.coded_kernels():
275  self._insert_ocl_arg_setter_routine_insert_ocl_arg_setter_routine(node.root, kern)
276 
277  # Insert fortcl, clfortran and c_iso_binding import statement
278  fortcl = ContainerSymbol("fortcl")
279  node.symbol_table.add(fortcl)
280  get_num_cmd_queues = RoutineSymbol(
281  "get_num_cmd_queues", interface=ImportInterface(fortcl))
282  get_cmd_queues = RoutineSymbol(
283  "get_cmd_queues", interface=ImportInterface(fortcl))
284  get_kernel_by_name = RoutineSymbol(
285  "get_kernel_by_name", interface=ImportInterface(fortcl))
286  node.symbol_table.add(get_num_cmd_queues)
287  node.symbol_table.add(get_cmd_queues)
288  node.symbol_table.add(get_kernel_by_name)
289  clfortran = ContainerSymbol("clfortran")
290  node.symbol_table.add(clfortran)
291  cl_finish = RoutineSymbol(
292  "clFinish", interface=ImportInterface(clfortran))
293  cl_launch = RoutineSymbol(
294  "clEnqueueNDRangeKernel",
295  interface=ImportInterface(clfortran))
296  node.symbol_table.add(cl_finish)
297  node.symbol_table.add(cl_launch)
298  iso_c_binding = ContainerSymbol("iso_c_binding")
299  node.symbol_table.add(iso_c_binding)
300  c_loc = RoutineSymbol(
301  "C_LOC", interface=ImportInterface(iso_c_binding))
302  c_null = DataSymbol(
303  "C_NULL_PTR", datatype=INTEGER_TYPE,
304  interface=ImportInterface(iso_c_binding))
305  node.symbol_table.add(c_loc)
306  node.symbol_table.add(c_null)
307 
308  # Include the check_status subroutine if we are in debug_mode
309  if api_config.debug_mode:
310  ocl_utils = ContainerSymbol("ocl_utils_mod")
311  check_status = RoutineSymbol(
312  "check_status", interface=ImportInterface(ocl_utils))
313  node.symbol_table.add(ocl_utils)
314  node.symbol_table.add(check_status)
315 
316  # Declare local variables needed by an OpenCL PSy-layer invoke
317  qlist = node.symbol_table.new_symbol(
318  "cmd_queues", symbol_type=DataSymbol,
319  datatype=UnsupportedFortranType(
320  "integer(kind=c_intptr_t), pointer, save :: cmd_queues(:)"),
321  tag="opencl_cmd_queues")
322  # 'first_time' needs to be an UnsupportedFortranType because it has
323  # SAVE and initial value
324  first = DataSymbol("first_time",
325  datatype=UnsupportedFortranType(
326  "logical, save :: first_time = .true."))
327  node.symbol_table.add(first, tag="first_time")
328  flag = node.symbol_table.new_symbol(
329  "ierr", symbol_type=DataSymbol, datatype=INTEGER_TYPE,
330  tag="opencl_error")
331  global_size = node.symbol_table.new_symbol(
332  "globalsize", symbol_type=DataSymbol,
333  datatype=UnsupportedFortranType(
334  "integer(kind=c_size_t), target :: globalsize(2)"))
335  local_size = node.symbol_table.new_symbol(
336  "localsize", symbol_type=DataSymbol,
337  datatype=UnsupportedFortranType(
338  "integer(kind=c_size_t), target :: localsize(2)"))
339 
340  # Bring all the boundaries at the beginning (since we are going to
341  # use them during the setup block - and they don't change)
342  boundary_vars = []
343  for tag, symbol in node.symbol_table.tags_dict.items():
344  if tag.startswith(("xstart_", "xstop_", "ystart_", "ystop_")):
345  boundary_vars.append(symbol)
346  cursor = 0
347  for assignment in node.walk(Assignment):
348  if assignment.lhs.symbol in boundary_vars:
349  node.children.insert(cursor, assignment.detach())
350  cursor += 1
351 
352  # Create block of code to execute only the first time:
353  setup_block = IfBlock.create(Reference(first), [])
354  setup_block.preceding_comment = \
355  "Initialise OpenCL runtime, kernels and buffers"
356  node.children.insert(cursor, setup_block)
357  setup_block.if_body.addchild(Call.create(psy_init, []))
358 
359  # Set up cmd_queues pointer
360  ptree = Fortran2003.Pointer_Assignment_Stmt(
361  f"{qlist.name} => {get_cmd_queues.name}()")
362  cblock = CodeBlock([ptree], CodeBlock.Structure.STATEMENT)
363  setup_block.if_body.addchild(cblock)
364 
365  # Declare and assign kernel pointers
366  for kern in node.coded_kernels():
367  name = "kernel_" + kern.name
368  try:
369  kpointer = node.symbol_table.lookup_with_tag(name)
370  except KeyError:
371  pointer_type = UnsupportedFortranType(
372  "INTEGER(KIND=c_intptr_t), TARGET, SAVE :: " + name)
373  kpointer = DataSymbol(name, datatype=pointer_type)
374  node.symbol_table.add(kpointer, tag=name)
375  setup_block.if_body.addchild(
376  Assignment.create(
377  Reference(kpointer),
378  Call.create(get_kernel_by_name,
379  [Literal(kern.name, CHARACTER_TYPE)])))
380 
381  # Traverse all arguments and make sure all the buffers are initialised
382  initialised_fields = set()
383  there_is_a_grid_buffer = False
384  for kern in node.coded_kernels():
385  for arg in kern.arguments.args:
386  if arg.argument_type == "field":
387  field = node.symbol_table.lookup(arg.name)
388  if field not in initialised_fields:
389  # Call the init_buffer routine with this field
390  call = Call.create(init_buf, [Reference(field)])
391  setup_block.if_body.addchild(call)
392  initialised_fields.add(field)
393  elif (arg.argument_type == "grid_property" and
394  not arg.is_scalar):
395  if not there_is_a_grid_buffer:
396  # Call the grid init_buffer routine
397  field = node.symbol_table.lookup(
398  kern.arguments.find_grid_access().name)
399  call = Call.create(init_grid, [Reference(field)])
400  setup_block.if_body.addchild(call)
401  there_is_a_grid_buffer = True
402  if not arg.is_scalar:
403  # All buffers will be assigned to a local OpenCL memory
404  # object to easily reference them, make sure this local
405  # variable is declared in the Invoke.
406  name = arg.name + "_cl_mem"
407  try:
408  node.symbol_table.lookup_with_tag(name)
409  except KeyError:
410  node.symbol_table.new_symbol(
411  name, tag=name, symbol_type=DataSymbol,
412  datatype=UnsupportedFortranType(
413  "INTEGER(KIND=c_intptr_t) :: " + name))
414 
415  # Now call all the set_args routines because in some platforms (e.g.
416  # in Xilinx FPGA) knowing which arguments each kernel is going to use
417  # allows the write operation to place the data into the appropriate
418  # memory bank.
419  first_statement_comment = False
420  kernel_names = set()
421  for kern in node.coded_kernels():
422  if kern.name not in kernel_names:
423  kernel_names.add(kern.name)
424  callblock = self._generate_set_args_call_generate_set_args_call(kern, node.scope)
425  for child in callblock.pop_all_children():
426  setup_block.if_body.addchild(child)
427  if not first_statement_comment:
428  child.preceding_comment = (
429  "Do a set_args now so subsequent writes place the "
430  "data appropriately")
431  first_statement_comment = True
432 
433  # Now we can insert calls to write_to_device method for each buffer
434  # and the grid writing call if there is one (in a new first time block)
435  first_statement_comment = False
436  for field in initialised_fields:
437  call = Call.create(
438  RoutineSymbol(field.name+"%write_to_device"), [])
439  setup_block.if_body.addchild(call)
440  if not first_statement_comment:
441  call.preceding_comment = "Write data to the device"
442  first_statement_comment = True
443 
444  if there_is_a_grid_buffer:
445  fieldarg = node.coded_kernels()[0].arguments.find_grid_access()
446  field = node.symbol_table.lookup(fieldarg.name)
447  call = Call.create(write_grid_buf, [Reference(field)])
448  setup_block.if_body.addchild(call)
449 
450  # We will just mark the nodes we are replacing as deleting them inside
451  # the loop would break the PSy-layer backward_dependency method in the
452  # following iterations. We will detach all these nodes after the loop.
453  nodes_to_detach = []
454 
455  # Transform each kernel call loop construct to its equivalent FortCL
456  # statements
457  for kern in node.coded_kernels():
458  outerloop = kern.ancestor(GOLoop).ancestor(GOLoop)
459 
460  # Set up globalsize and localsize arrays
461  garg = node.coded_kernels()[0].arguments.find_grid_access()
462  num_x = api_config.grid_properties["go_grid_nx"].fortran\
463  .format(garg.name)
464  num_y = api_config.grid_properties["go_grid_ny"].fortran\
465  .format(garg.name)
466  assig = Assignment.create(
467  Reference(global_size),
468  Literal(f"(/{num_x}, {num_y}/)",
469  ArrayType(INTEGER_TYPE, [2])))
470  node.children.insert(outerloop.position, assig)
471  local_size_value = kern.opencl_options['local_size']
472  assig = Assignment.create(
473  Reference(local_size),
474  Literal(f"(/{local_size_value}, 1/)",
475  ArrayType(INTEGER_TYPE, [2])))
476  node.children.insert(outerloop.position, assig)
477 
478  # Check that the global_size is multiple of the local_size
479  if api_config.debug_mode:
480  fortran_reader = FortranReader()
481  global_size_expr = fortran_reader.psyir_from_expression(
482  num_x, node.symbol_table)
483  self._add_divisibility_check_add_divisibility_check(node, outerloop.position,
484  check_status, global_size_expr,
485  local_size_value)
486 
487  # Retrieve kernel symbol
488  kernelsym = node.symbol_table.lookup_with_tag(
489  "kernel_" + kern.name)
490 
491  # Choose the command queue number to which to dispatch this kernel.
492  # We have do deal with possible dependencies to kernels dispatched
493  # in different command queues as the order of execution is not
494  # guaranteed.
495  queue_number = kern.opencl_options['queue_number']
496  cmd_queue = ArrayReference.create(
497  qlist, [Literal(str(queue_number), INTEGER_TYPE)])
498  dependency = outerloop.backward_dependence()
499 
500  # If the dependency is a loop containing a kernel, add a barrier if
501  # the previous kernels were dispatched in a different command queue
502  if dependency:
503  for kernel_dep in dependency.coded_kernels():
504  previous_queue = kernel_dep.opencl_options['queue_number']
505  if previous_queue != queue_number:
506  # If the backward dependency is being executed in
507  # another queue we add a barrier to make sure the
508  # previous kernel has finished before this halo
509  # exchange starts.
510  barrier = Assignment.create(
511  Reference(flag),
512  Call.create(cl_finish, [
513  ArrayReference.create(qlist, [
514  Literal(str(previous_queue),
515  INTEGER_TYPE)])]))
516  node.children.insert(outerloop.position, barrier)
517 
518  # If the dependency is something other than a kernel, currently we
519  # dispatch everything else to queue _OCL_MANAGEMENT_QUEUE, so add a
520  # barrier if this kernel is not on queue _OCL_MANAGEMENT_QUEUE.
521  if dependency and not dependency.coded_kernels() and \
522  queue_number != self._OCL_MANAGEMENT_QUEUE_OCL_MANAGEMENT_QUEUE:
523  barrier = Assignment.create(
524  Reference(flag),
525  Call.create(cl_finish, [
526  ArrayReference.create(qlist, [
527  Literal(str(self._OCL_MANAGEMENT_QUEUE_OCL_MANAGEMENT_QUEUE),
528  INTEGER_TYPE)])]))
529  node.children.insert(outerloop.position, barrier)
530 
531  # Check that everything has succeeded before the kernel launch
532  if api_config.debug_mode:
533  self._add_ready_check_add_ready_check(node, outerloop.position, check_status,
534  kern.name, flag, cl_finish,
535  cmd_queue.copy())
536  callblock = self._generate_set_args_call_generate_set_args_call(kern, node.scope)
537  for child in callblock.pop_all_children():
538  node.children.insert(outerloop.position, child)
539 
540  # Then we call the clEnqueueNDRangeKernel
541  assig = Assignment.create(
542  Reference(flag),
543  Call.create(cl_launch, [
544  # OpenCL Command Queue
545  cmd_queue,
546  # OpenCL Kernel object
547  Reference(kernelsym),
548  # Number of work dimensions
549  Literal("2", INTEGER_TYPE),
550  # Global offset (if NULL the global IDs start at
551  # offset (0,0,0))
552  Reference(c_null),
553  # Global work size
554  Call.create(c_loc, [Reference(global_size)]),
555  # Local work size
556  Call.create(c_loc, [Reference(local_size)]),
557  # Number of events in wait list
558  Literal("0", INTEGER_TYPE),
559  # Event wait list that need to be completed before
560  # this kernel
561  Reference(c_null),
562  # Event that identifies this kernel completion
563  Reference(c_null)]))
564  assig.preceding_comment = "Launch the kernel"
565  node.children.insert(outerloop.position, assig)
566  self._insert_kernel_code_in_opencl_file_insert_kernel_code_in_opencl_file(kern)
567 
568  # Add additional checks if we are in debug mode
569  if api_config.debug_mode:
570  self._add_kernel_check_add_kernel_check(node, outerloop.position, check_status,
571  kern.name, flag, cl_finish,
572  cmd_queue.copy())
573 
574  nodes_to_detach.append(outerloop)
575 
576  # If we execute the kernels asynchronously, we need to add wait
577  # statements before the halo exchanges to guarantee that the data
578  # has been updated
579  for possible_dependent_node in node.walk(HaloExchange):
580  # The backward_dependences returns the last Loop with a kernel
581  # that has a dependency with this halo exchange
582  dependency = possible_dependent_node.backward_dependence()
583  if dependency:
584  for kernel_dep in dependency.coded_kernels():
585  previous_queue = kernel_dep.opencl_options['queue_number']
586  if previous_queue != self._OCL_MANAGEMENT_QUEUE_OCL_MANAGEMENT_QUEUE:
587  # If the backward dependency is being executed in
588  # another queue we add a barrier to make sure the
589  # previous kernel has finished before this one starts.
590  barrier = Assignment.create(
591  Reference(flag),
592  Call.create(cl_finish, [
593  ArrayReference.create(qlist, [
594  Literal(str(previous_queue),
595  INTEGER_TYPE)])]))
596  pos = possible_dependent_node.position
597  node.children.insert(pos, barrier)
598 
599  for node_to_detach in nodes_to_detach:
600  node_to_detach.detach()
601 
602  if end_barrier:
603  self._add_end_barrier_add_end_barrier(node, flag, cl_finish, qlist)
604 
605  # And at the very end always makes sure that first_time value is False
606  assign = Assignment.create(Reference(first),
607  Literal("false", BOOLEAN_TYPE))
608  assign.preceding_comment = "Unset the first time flag"
609  node.addchild(assign)
610 
611  self._output_opencl_kernels_file_output_opencl_kernels_file()
612 
613  def _add_end_barrier(self, node, flag, cl_finish, qlist):
614  ''' Append into the given node a OpenCL Wait operation for each of
615  the OpenCL queues in use.
616 
617  :param node: PSyIR node where to append the barrier.
618  :type node: :py:class:`psyclone.psyir.nodes.Schedule`
619  :param flag: PSyIR symbol to use as flag.
620  :type flag: :py:class:`psyclone.psyir.symbols.DataSymbol`
621  :param cl_finish: PSyIR symbol of the barrier routine.
622  :type cl_finish: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
623  :param qlist: PSyIR symbol of the OpenCL queues array.
624  :type qlist: :py:class:`psyclone.psyir.symbols.DataSymbol`
625 
626  '''
627  # We need a clFinish for each of the queues in the implementation
628  added_comment = False
629  for num in range(1, self._max_queue_number_max_queue_number_max_queue_number + 1):
630  queue = ArrayReference.create(qlist, [Literal(str(num),
631  INTEGER_TYPE)])
632  node.addchild(
633  Assignment.create(
634  Reference(flag), Call.create(cl_finish, [queue])))
635  if not added_comment:
636  node.children[-1].preceding_comment = \
637  "Wait until all kernels have finished"
638  added_comment = True
639 
640  @staticmethod
641  def _add_divisibility_check(node, position, check_status, global_size_expr,
642  local_size):
643  ''' Insert into node a check that the global_size is exactly
644  divisible by the local size.
645 
646  :param node: where to insert the divisibility check.
647  :type node: :py:class:`psyclone.psyir.nodes.Schedule`
648  :param int position: location where to insert the divisibilitay check.
649  :param check_status: PSyIR symbol of the check routine.
650  :type check_status: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
651  :param global_size_expr: PSyIR representing the global_size.
652  :type global_size_expr: :py:class:`psyclone.psyir.nodes.DataNode`
653  :param int local_size: size of the OpenCL local work_group.
654 
655  '''
656  check = BinaryOperation.create(
657  BinaryOperation.Operator.NE,
658  IntrinsicCall.create(
659  IntrinsicCall.Intrinsic.MOD,
660  [global_size_expr,
661  Literal(str(local_size), INTEGER_TYPE)]
662  ),
663  Literal("0", INTEGER_TYPE))
664  message = ("Global size is not a multiple of local size ("
665  "mandatory in OpenCL < 2.0).")
666  error = Call.create(check_status,
667  [Literal(message, CHARACTER_TYPE),
668  Literal("-1", INTEGER_TYPE)])
669  ifblock = IfBlock.create(check, [error])
670  node.children.insert(position, ifblock)
671 
672  @staticmethod
673  def _add_kernel_check(node, position, check_status, kernel_name,
674  flag, cl_finish, cmd_queue):
675  ''' Insert into node a check that the kernel has been launched and
676  has been executed successfully.
677 
678  :param node: where to insert the kernel check.
679  :type node: :py:class:`psyclone.psyir.nodes.Schedule`
680  :param int position: location where to insert the kernel check.
681  :param check_status: PSyIR symbol of the check routine.
682  :type check_status: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
683  :param str kernel_name: name of the kernel being checked.
684  :param flag: PSyIR symbol to use as flag.
685  :type flag: :py:class:`psyclone.psyir.symbols.DataSymbol`
686  :param cl_finish: PSyIR symbol of the barrier routine.
687  :type cl_finish: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
688  :param cmd_queue: PSyIR symbol of the OpenCL command queues array.
689  :type cmd_queue: :py:class:`psyclone.psyir.symbols.DataSymbol`
690 
691  '''
692  # First check the launch return value
693  message = Literal(f"{kernel_name} clEnqueueNDRangeKernel",
694  CHARACTER_TYPE)
695  check = Call.create(check_status, [message, Reference(flag)])
696  node.children.insert(position, check)
697 
698  # Then add a barrier
699  barrier = Assignment.create(
700  Reference(flag),
701  Call.create(cl_finish, [cmd_queue]))
702  node.children.insert(position + 1, barrier)
703 
704  # And check the kernel executed successfully
705  message = Literal(f"Errors during {kernel_name}", CHARACTER_TYPE)
706  check = Call.create(check_status, [message, Reference(flag)])
707  node.children.insert(position + 2, check)
708 
709  @staticmethod
710  def _add_ready_check(node, position, check_status, kernel_name,
711  flag, cl_finish, cmd_queue):
712  ''' Insert into node a check that verifies if everything in the
713  command queues previous to a kernel launch has completed successfully.
714 
715  :param node: where to insert the kernel check.
716  :type node: :py:class:`psyclone.psyir.nodes.Schedule`
717  :param int position: location where to insert the kernel check.
718  :param check_status: PSyIR symbol of the check routine.
719  :type check_status: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
720  :param str kernel_name: name of the kernel being checked.
721  :param flag: PSyIR symbol to use as flag.
722  :type flag: :py:class:`psyclone.psyir.symbols.DataSymbol`
723  :param cl_finish: PSyIR symbol of the barrier routine.
724  :type cl_finish: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
725  :param cmd_queue: PSyIR symbol of the OpenCL command queues array.
726  :type cmd_queue: :py:class:`psyclone.psyir.symbols.DataSymbol`
727 
728  '''
729  barrier = Assignment.create(
730  Reference(flag),
731  Call.create(cl_finish, [cmd_queue]))
732  node.children.insert(position, barrier)
733  message = Literal(f"Errors before {kernel_name} launch",
734  CHARACTER_TYPE)
735  check = Call.create(check_status, [message, Reference(flag)])
736  node.children.insert(position + 1, check)
737 
738  def _insert_kernel_code_in_opencl_file(self, kernel):
739  ''' Insert the given kernel into a OpenCL file. For this we need
740  to remove the 'go_wp' precision symbol which can't be generated
741  by OpenCL. We assume 'go_wp' is a OpenCL double.
742 
743  :param kernel: the kernel to insert.
744  :type kernel: :py:class:`psyclone.psyir.nodes.KernelSchedule`
745 
746  '''
747  if not self._kernels_file_kernels_file:
748  self._kernels_file_kernels_file = FileContainer("opencl_kernels")
749 
750  # Create a copy of the kernel and remove precision symbols since they
751  # are not supported in the OpenCL backend.
752  kernel_copy = kernel.get_kernel_schedule().copy()
753  symtab = kernel_copy.symbol_table
754 
755  # TODO #898: Removing symbols is not properly supported by PSyIR
756  # because we have to deal with all references to it. In this case we
757  # implement manually a conversion of all 'go_wp' to a double precision
758  # and remove the symbol because we guarantee that it just appear in the
759  # declarations of other symbols (symtab.datasymbols).
760  # pylint: disable=protected-access
761  for sym in symtab.datasymbols:
762  # Not all types have the 'precision' attribute (e.g.
763  # UnresolvedType)
764  if (hasattr(sym.datatype, "precision") and
765  isinstance(sym.datatype.precision, DataSymbol)):
766  sym.datatype._precision = ScalarType.Precision.DOUBLE
767 
768  if 'go_wp' in symtab:
769  del symtab._symbols['go_wp']
770 
771  # Insert kernel in the OpenCL kernels file if it doesn't already exist
772  for routine in self._kernels_file_kernels_file.walk(Routine):
773  if routine.name == kernel.name:
774  break # if it exist re-use existing one
775  # TODO 1572: Here we assume that in the same Invoke (scope) a
776  # kernel with the same name will be the same kernel, but that
777  # may not be true when doing multiple invokes.
778  else:
779  self._kernels_file_kernels_file.addchild(kernel_copy)
780 
781  def _output_opencl_kernels_file(self):
782  ''' Write the OpenCL kernels to a file using the OpenCL backend.
783 
784  '''
785  # TODO 1013: The code below duplicates some logic of the CodedKern
786  # rename_and_write method. Ideally this should be moved out of
787  # the AST and transformations and put into some kind of IOManager.
788 
789  ocl_writer = OpenCLWriter(kernels_local_size=64)
790  new_kern_code = ocl_writer(self._kernels_file_kernels_file)
791 
792  fdesc = None
793  name_idx = -1
794  while not fdesc:
795  name_idx += 1
796  new_name = f"opencl_kernels_{name_idx}.cl"
797 
798  try:
799  # Atomically attempt to open the new kernel file (in case
800  # this is part of a parallel build)
801  fdesc = os.open(
802  os.path.join(Config.get().kernel_output_dir, new_name),
803  os.O_CREAT | os.O_WRONLY | os.O_EXCL)
804  except (OSError, IOError):
805  # The os.O_CREATE and os.O_EXCL flags in combination mean
806  # that open() raises an error if the file exists
807  continue
808 
809  # Write the modified AST out to file
810  os.write(fdesc, new_kern_code.encode())
811  # Close the new kernel file
812  os.close(fdesc)
813 
814  @staticmethod
815  def _generate_set_args_call(kernel, scope):
816  '''
817  Generate the Call statement to the set_args subroutine for the
818  provided kernel.
819 
820  :param kernel: the kernel for which to generate a call to its \
821  arg_setter subroutine.
822  :type kernel: :py:class:`psyclone.psyGen.CodedKern`
823  :param scope: The node representing the scope where the call \
824  statements will be inserted.
825  :type scope: :py:class:`psyclone.psyir.nodes.ScopingNode`
826 
827  :returns: a block of statements that represent the set_args call
828  :rtype: :py:class:`psyclone.psyir.nodes.Schedule`
829 
830  '''
831  call_block = Schedule()
832 
833  # Retrieve symbol table and kernel symbol
834  symtab = scope.symbol_table
835  kernelsym = symtab.lookup_with_tag("kernel_" + kernel.name)
836 
837  # Find the symbol that defines each boundary for this kernel.
838  # In OpenCL the iteration boundaries are passed as arguments to the
839  # kernel because the global work size may exceed the dimensions and
840  # therefore the updates outside the boundaries should be masked.
841  # If any of the boundaries is not found, it can not proceed.
842  boundaries = []
843  try:
844  for boundary in ["xstart", "xstop", "ystart", "ystop"]:
845  tag = boundary + "_" + kernel.name
846  symbol = symtab.lookup_with_tag(tag)
847  boundaries.append(symbol.name)
848  except KeyError as err:
849  raise GenerationError(
850  f"Boundary symbol tag '{tag}' not found while generating the "
851  f"OpenCL code for kernel '{kernel.name}'. Make sure to apply "
852  f"the GOMoveIterationBoundariesInsideKernelTrans before "
853  f"attempting the OpenCL code generation.") from err
854 
855  api_config = Config.get().api_conf("gocean1.0")
856  # Prepare the argument list for the set_args routine
857  arguments = [Reference(kernelsym)]
858  for arg in kernel.arguments.args:
859  if arg.argument_type == "scalar":
860  if arg.name in boundaries:
861  # Boundary values are 0-indexed in OpenCL and 1-indexed in
862  # PSyIR, therefore we need to subtract 1
863  bop = BinaryOperation.create(BinaryOperation.Operator.SUB,
864  arg.psyir_expression(),
865  Literal("1", INTEGER_TYPE))
866  arguments.append(bop)
867  else:
868  arguments.append(arg.psyir_expression())
869  elif arg.argument_type == "field":
870  # Cast buffer to cl_mem type expected by OpenCL
871  field = symtab.lookup(arg.name)
872  symbol = symtab.lookup_with_tag(arg.name + "_cl_mem")
873  source = StructureReference.create(field, ['device_ptr'])
874  dest = Reference(symbol)
875  icall = IntrinsicCall.create(IntrinsicCall.Intrinsic.TRANSFER,
876  [source, dest])
877  assig = Assignment.create(dest.copy(), icall)
878  call_block.addchild(assig)
879  arguments.append(Reference(symbol))
880  elif arg.argument_type == "grid_property":
881  garg = kernel.arguments.find_grid_access()
882  if arg.is_scalar:
883  # pylint: disable=protected-access
884  arguments.append(
885  StructureReference.create(
886  symtab.lookup(garg.name),
887  api_config.grid_properties[arg._property_name]
888  .fortran.split('%')[1:]
889  ))
890  else:
891  # Cast grid buffer to cl_mem type expected by OpenCL
892  device_grid_property = arg.name + "_device"
893  field = symtab.lookup(garg.name)
894  source = StructureReference.create(
895  field, ['grid', device_grid_property])
896  symbol = symtab.lookup_with_tag(arg.name + "_cl_mem")
897  dest = Reference(symbol)
898  icall = IntrinsicCall.create(
899  IntrinsicCall.Intrinsic.TRANSFER,
900  [source, dest])
901  assig = Assignment.create(dest.copy(), icall)
902  call_block.addchild(assig)
903  arguments.append(Reference(symbol))
904 
905  call_symbol = symtab.lookup_with_tag(kernel.name + "_set_args")
906  call_block.addchild(Call.create(call_symbol, arguments))
907  return call_block
908 
909  @staticmethod
910  def _insert_ocl_arg_setter_routine(node, kernel):
911  '''
912  Returns the symbol of the subroutine that sets the OpenCL kernel
913  arguments for the provided PSy-layer kernel using FortCL. If the
914  subroutine doesn't exist it also generates it.
915 
916  :param node: the container where the new subroutine will be inserted.
917  :type node: :py:class:`psyclone.psyir.nodes.Container`
918  :param kernel: the kernel call for which to provide the arg_setter \
919  subroutine.
920  :type kernel: :py:class:`psyclone.psyGen.CodedKern`
921 
922  :returns: the symbol representing the arg_setter subroutine.
923  :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
924 
925  '''
926  # Check if the subroutine already exist.
927  sub_name = kernel.name + "_set_args"
928  try:
929  return node.symbol_table.lookup_with_tag(sub_name)
930  except KeyError:
931  # If the Symbol does not exist, the rest of this method
932  # will generate it.
933  pass
934 
935  # Create the new Routine and RoutineSymbol
936  node.symbol_table.add(RoutineSymbol(sub_name), tag=sub_name)
937  argsetter = Routine(sub_name)
938  arg_list = []
939 
940  # Add subroutine imported symbols
941  clfortran = ContainerSymbol("clfortran")
942  clsetkernelarg = RoutineSymbol("clSetKernelArg",
943  interface=ImportInterface(clfortran))
944  iso_c = ContainerSymbol("iso_c_binding")
945  c_sizeof = RoutineSymbol("C_SIZEOF", interface=ImportInterface(iso_c))
946  c_loc = RoutineSymbol("C_LOC", interface=ImportInterface(iso_c))
947  c_intptr_t = RoutineSymbol("c_intptr_t",
948  interface=ImportInterface(iso_c))
949  ocl_utils = ContainerSymbol("ocl_utils_mod")
950  check_status = RoutineSymbol("check_status",
951  interface=ImportInterface(ocl_utils))
952  argsetter.symbol_table.add(clfortran)
953  argsetter.symbol_table.add(clsetkernelarg)
954  argsetter.symbol_table.add(iso_c)
955  argsetter.symbol_table.add(c_sizeof)
956  argsetter.symbol_table.add(c_loc)
957  argsetter.symbol_table.add(c_intptr_t)
958  argsetter.symbol_table.add(ocl_utils)
959  argsetter.symbol_table.add(check_status)
960 
961  # Add an argument symbol for the kernel object
962  kobj = argsetter.symbol_table.new_symbol(
963  "kernel_obj", symbol_type=DataSymbol,
964  interface=ArgumentInterface(ArgumentInterface.Access.READ),
965  datatype=UnsupportedFortranType(
966  "INTEGER(KIND=c_intptr_t), TARGET :: kernel_obj"))
967  arg_list.append(kobj)
968 
969  # Include each kernel call argument as an argument of this routine
970  for arg in kernel.arguments.args:
971 
972  name = argsetter.symbol_table.next_available_name(arg.name)
973 
974  # This function requires 'TARGET' annotated declarations which are
975  # not supported in the PSyIR, so we build them as
976  # UnsupportedFortranType for now.
977  if arg.is_scalar and arg.intrinsic_type == "real":
978  pointer_type = UnsupportedFortranType(
979  "REAL(KIND=go_wp), INTENT(IN), TARGET :: " + name)
980  elif arg.is_scalar:
981  pointer_type = UnsupportedFortranType(
982  "INTEGER, INTENT(IN), TARGET :: " + name)
983  else:
984  # Everything else is a cl_mem pointer (c_intptr_t)
985  pointer_type = UnsupportedFortranType(
986  "INTEGER(KIND=c_intptr_t), INTENT(IN), TARGET :: " + name)
987 
988  new_arg = DataSymbol(
989  name, datatype=pointer_type,
990  interface=ArgumentInterface(ArgumentInterface.Access.READ))
991  argsetter.symbol_table.add(new_arg)
992  arg_list.append(new_arg)
993 
994  argsetter.symbol_table.specify_argument_list(arg_list)
995 
996  # Create the ierr local variable
997  ierr = argsetter.symbol_table.new_symbol(
998  "ierr", symbol_type=DataSymbol, datatype=INTEGER_TYPE)
999 
1000  # Call the clSetKernelArg for each argument and a check_status to
1001  # see if the OpenCL call has succeeded
1002  for index, variable in enumerate(arg_list[1:]):
1003  call = Call.create(clsetkernelarg,
1004  [Reference(kobj),
1005  Literal(str(index), INTEGER_TYPE),
1006  Call.create(c_sizeof, [Reference(variable)]),
1007  Call.create(c_loc, [Reference(variable)])])
1008  assignment = Assignment.create(Reference(ierr), call)
1009  argsetter.addchild(assignment)
1010  emsg = f"clSetKernelArg: arg {index} of {kernel.name}"
1011  call = Call.create(check_status, [Literal(emsg, CHARACTER_TYPE),
1012  Reference(ierr)])
1013  argsetter.addchild(call)
1014 
1015  argsetter.children[0].preceding_comment = \
1016  f"Set the arguments for the {kernel.name} OpenCL Kernel"
1017 
1018  # Add the subroutine as child of the provided node
1019  node.addchild(argsetter)
1020 
1021  return node.symbol_table.lookup_with_tag(sub_name)
1022 
1023  def _insert_opencl_init_routine(self, node):
1024  '''
1025  Returns the symbol of the subroutine that initialises the OpenCL
1026  environment using FortCL. If the subroutine doesn't exist it also
1027  generates it.
1028 
1029  :param node: the container where the new subroutine will be inserted.
1030  :type node: :py:class:`psyclone.psyir.nodes.Container`
1031 
1032  :returns: the symbol representing the OpenCL initialisation subroutine.
1033  :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
1034 
1035  '''
1036  symtab = node.symbol_table
1037  try:
1038  # TODO #1572: The ocl_init routine may need to be regenerated if
1039  # there are multiple Invokes because _max_queue_number may have
1040  # increased and we need to load the kernels of both invokes.
1041  return symtab.lookup_with_tag("ocl_init_routine")
1042  except KeyError:
1043  # If the Symbol does not exist, the rest of this method
1044  # will generate it.
1045  pass
1046 
1047  # Create the symbol for the routine and add it to the symbol table.
1048  subroutine_name = symtab.new_symbol("psy_init",
1049  symbol_type=RoutineSymbol,
1050  tag="ocl_init_routine").name
1051 
1052  # Choose a round-robin device number if it has MPI and multiple
1053  # accelerators.
1054  distributed_memory = Config.get().distributed_memory
1055  devices_per_node = Config.get().ocl_devices_per_node
1056  additional_uses = ""
1057  additional_stmts = ""
1058  if devices_per_node > 1 and distributed_memory:
1059  additional_uses += "USE parallel_mod, ONLY: get_rank"
1060  additional_stmts += \
1061  f"ocl_device_num = mod(get_rank()-1, {devices_per_node}) + 1"
1062 
1063  # Get a set of all kernel names in the Container. This implementation
1064  # currently assumes all of them will be available in OpenCL
1065  unique_kernels = {kernel.name for kernel in node.coded_kernels()}
1066 
1067  # Code of the subroutine in Fortran
1068  code = f'''
1069  subroutine psy_init()
1070  {additional_uses}
1071  use fortcl, only: ocl_env_init, add_kernels
1072  character(len=30) kernel_names({len(unique_kernels)})
1073  integer :: ocl_device_num=1
1074  logical, save :: initialised=.false.
1075  ! Check to make sure we only execute this routine once
1076  if (.not. initialised) then
1077  initialised = .true.
1078  ! Initialise the opencl environment/device
1079  {additional_stmts}
1080  call ocl_env_init({self._max_queue_number}, ocl_device_num, &
1081  {".true." if self._enable_profiling else ".false."}, &
1082  {".true." if self._out_of_order else ".false."})
1083  ! The kernels this psy layer module requires
1084  '''
1085 
1086  for index, kernel_name in enumerate(unique_kernels):
1087  code += f"kernel_names({index + 1}) = \"{kernel_name}\"\n"
1088 
1089  code += f'''\
1090  ! Create the opencl kernel objects. This expects to find all of
1091  ! the compiled kernels in FORTCL_KERNELS_FILE environment variable
1092  call add_kernels({len(unique_kernels)}, kernel_names)
1093  end if
1094  end subroutine psy_init'''
1095 
1096  # Obtain the PSyIR representation of the code above
1097  fortran_reader = FortranReader()
1098  container = fortran_reader.psyir_from_source(code)
1099  subroutine = container.children[0]
1100  # Rename subroutine
1101  subroutine.name = subroutine_name
1102 
1103  # Add the subroutine as child of the provided node
1104  node.addchild(subroutine.detach())
1105 
1106  return symtab.lookup_with_tag("ocl_init_routine")
1107 
1108  @staticmethod
1109  def _insert_initialise_grid_buffers(node):
1110  '''
1111  Returns the symbol of a subroutine that initialises all OpenCL grid
1112  buffers in the OpenCL device using FortCL. If the subroutine doesn't
1113  already exist it also generates it.
1114 
1115  :param node: the container where the new subroutine will be inserted.
1116  :type node: :py:class:`psyclone.psyir.nodes.Container`
1117 
1118  :returns: the symbol of the grid buffer initialisation subroutine.
1119  :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
1120 
1121  '''
1122  # pylint: disable=too-many-locals
1123  symtab = node.symbol_table
1124  try:
1125  return symtab.lookup_with_tag("ocl_init_grid_buffers")
1126  except KeyError:
1127  # If the Symbol does not exist, the rest of this method
1128  # will generate it.
1129  pass
1130 
1131  # Create the symbol for the routine and add it to the symbol table.
1132  subroutine_name = symtab.new_symbol("initialise_grid_device_buffers",
1133  symbol_type=RoutineSymbol,
1134  tag="ocl_init_grid_buffers").name
1135 
1136  # Get the GOcean API property names used in this routine
1137  api_config = Config.get().api_conf("gocean1.0")
1138  props = api_config.grid_properties
1139  num_x = props["go_grid_nx"].fortran.format("field")
1140  num_y = props["go_grid_ny"].fortran.format("field")
1141 
1142  int_arrays = []
1143  real_arrays = []
1144  for key, prop in props.items():
1145  if key == "go_grid_data":
1146  # TODO #676: Ignore because go_grid_data is actually a field
1147  # property
1148  continue
1149  if prop.type == "array" and prop.intrinsic_type == "integer":
1150  int_arrays.append(prop.fortran.format("field"))
1151  elif prop.type == "array" and prop.intrinsic_type == "real":
1152  real_arrays.append(prop.fortran.format("field"))
1153 
1154  # Code of the subroutine in Fortran
1155  code = f'''
1156  subroutine initialise_device_grid(field)
1157  USE fortcl, ONLY: create_ronly_buffer
1158  USE iso_c_binding, only: c_size_t
1159  use field_mod
1160  type(r2d_field), intent(inout), target :: field
1161  integer(kind=c_size_t) size_in_bytes
1162  IF (.not. c_associated({int_arrays[0]}_device)) THEN
1163  ! Create integer grid fields
1164  size_in_bytes = int({num_x}*{num_y}, 8) * &
1165  c_sizeof({int_arrays[0]}(1,1))
1166  '''
1167 
1168  for int_array in int_arrays:
1169  code += f'''
1170  {int_array}_device = transfer( &
1171  create_ronly_buffer(size_in_bytes), {int_array}_device)
1172  '''
1173 
1174  code += f'''
1175  ! Create real grid buffers
1176  size_in_bytes = int({num_x} * {num_y}, 8) * &
1177  c_sizeof({real_arrays[0]}(1,1))
1178  '''
1179 
1180  for real_array in real_arrays:
1181  code += f'''
1182  {real_array}_device = transfer( &
1183  create_ronly_buffer(size_in_bytes), {real_array}_device)
1184  '''
1185 
1186  code += '''
1187  END IF
1188  end subroutine initialise_device_grid
1189  '''
1190 
1191  # Obtain the PSyIR representation of the code above
1192  fortran_reader = FortranReader()
1193  container = fortran_reader.psyir_from_source(code)
1194  subroutine = container.children[0]
1195  # Rename subroutine
1196  subroutine.name = subroutine_name
1197 
1198  # Add the subroutine as child of the provided node
1199  node.addchild(subroutine.detach())
1200 
1201  return symtab.lookup_with_tag("ocl_init_grid_buffers")
1202 
1203  def _insert_write_grid_buffers(self, node):
1204  '''
1205  Returns the symbol of a subroutine that writes the values of the grid
1206  properties into the OpenCL device buffers using FortCL. If the
1207  subroutine doesn't already exist it also generates it.
1208 
1209  :param node: the container where the new subroutine will be inserted.
1210  :type node: :py:class:`psyclone.psyir.nodes.Container`
1211 
1212  :returns: the symbol representing the grid buffers writing subroutine.
1213  :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
1214 
1215  '''
1216  # pylint: disable=too-many-locals
1217  symtab = node.symbol_table
1218  try:
1219  return symtab.lookup_with_tag("ocl_write_grid_buffers")
1220  except KeyError:
1221  # If the Symbol does not exist, the rest of this method
1222  # will generate it.
1223  pass
1224 
1225  # Create the symbol for the routine and add it to the symbol table.
1226  subroutine_name = symtab.new_symbol("write_grid_buffers",
1227  symbol_type=RoutineSymbol,
1228  tag="ocl_write_grid_buffers").name
1229 
1230  # Get the GOcean API property names used in this routine
1231  api_config = Config.get().api_conf("gocean1.0")
1232  props = api_config.grid_properties
1233  num_x = props["go_grid_nx"].fortran.format("field")
1234  num_y = props["go_grid_ny"].fortran.format("field")
1235 
1236  # Code of the subroutine in Fortran
1237  code = f'''
1238  subroutine write_device_grid(field)
1239  USE fortcl, ONLY: get_cmd_queues
1240  use iso_c_binding, only: c_intptr_t, c_size_t, c_sizeof
1241  USE clfortran
1242  USE ocl_utils_mod, ONLY: check_status
1243  type(r2d_field), intent(inout), target :: field
1244  integer(kind=c_size_t) size_in_bytes
1245  INTEGER(c_intptr_t), pointer :: cmd_queues(:)
1246  integer(c_intptr_t) :: cl_mem
1247  integer :: ierr
1248  cmd_queues => get_cmd_queues()
1249  ! Integer grid buffers
1250  size_in_bytes = int({num_x} * {num_y}, 8) * &
1251  c_sizeof(field%grid%tmask(1,1))
1252  cl_mem = transfer(field%grid%tmask_device, cl_mem)
1253  ierr = clEnqueueWriteBuffer( &
1254  cmd_queues({self._OCL_MANAGEMENT_QUEUE}), &
1255  cl_mem, CL_TRUE, 0_8, size_in_bytes, &
1256  C_LOC(field%grid%tmask), 0, C_NULL_PTR, C_NULL_PTR)
1257  CALL check_status("clEnqueueWriteBuffer tmask", ierr)
1258  ! Real grid buffers
1259  size_in_bytes = int({num_x} * {num_y}, 8) * &
1260  c_sizeof(field%grid%area_t(1,1))
1261  '''
1262  write_str = '''
1263  cl_mem = transfer(field%grid%{0}_device, cl_mem)
1264  ierr = clEnqueueWriteBuffer(cmd_queues({1}), &
1265  cl_mem, CL_TRUE, 0_8, size_in_bytes, &
1266  C_LOC(field%grid%{0}), 0, C_NULL_PTR, C_NULL_PTR)
1267  CALL check_status("clEnqueueWriteBuffer {0}_device", ierr)
1268  '''
1269  for grid_prop in ['area_t', 'area_u', 'area_v', 'dx_u', 'dx_v',
1270  'dx_t', 'dy_u', 'dy_v', 'dy_t', 'gphiu', 'gphiv']:
1271  code += write_str.format(grid_prop, self._OCL_MANAGEMENT_QUEUE_OCL_MANAGEMENT_QUEUE)
1272  code += "end subroutine write_device_grid"
1273 
1274  # Obtain the PSyIR representation of the code above
1275  fortran_reader = FortranReader()
1276  container = fortran_reader.psyir_from_source(code)
1277  subroutine = container.children[0]
1278  # Rename subroutine
1279  subroutine.name = subroutine_name
1280 
1281  # Add the subroutine as child of the provided node
1282  node.addchild(subroutine.detach())
1283 
1284  return symtab.lookup_with_tag("ocl_write_grid_buffers")
1285 
1286  def _insert_ocl_read_from_device_function(self, node):
1287  '''
1288  Returns the symbol of a subroutine that retrieves the data back from
1289  an OpenCL device using FortCL. If the subroutine doesn't already exist
1290  it also generates it.
1291 
1292  :param node: the container where the new subroutine will be inserted.
1293  :type node: :py:class:`psyclone.psyir.nodes.Container`
1294 
1295  :returns: the symbol of the buffer data retrieving subroutine.
1296  :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
1297 
1298  '''
1299  symtab = node.symbol_table
1300  try:
1301  return symtab.lookup_with_tag("ocl_read_func")
1302  except KeyError:
1303  # If the subroutines does not exist, it needs to be
1304  # generated first.
1305  pass
1306 
1307  # Create the symbol for the routine and add it to the symbol table.
1308  subroutine_name = symtab.new_symbol("read_from_device",
1309  symbol_type=RoutineSymbol,
1310  tag="ocl_read_func").name
1311 
1312  # Code of the subroutine in Fortran
1313  code = f'''
1314  subroutine read_sub(from, to, startx, starty, nx, ny, blocking)
1315  USE iso_c_binding, only: c_ptr, c_intptr_t, c_size_t, c_sizeof
1316  USE ocl_utils_mod, ONLY: check_status
1317  use kind_params_mod, only: go_wp
1318  USE clfortran
1319  USE fortcl, ONLY: get_cmd_queues
1320  type(c_ptr), intent(in) :: from
1321  real(go_wp), intent(inout), dimension(:,:), target :: to
1322  integer, intent(in) :: startx, starty, nx, ny
1323  logical, intent(in) :: blocking
1324  INTEGER(c_size_t) :: size_in_bytes, offset_in_bytes
1325  integer(c_intptr_t) :: cl_mem
1326  INTEGER(c_intptr_t), pointer :: cmd_queues(:)
1327  integer :: ierr, i
1328 
1329  ! Give the from pointer the appropriate OpenCL memory object type
1330  cl_mem = transfer(from, cl_mem)
1331  cmd_queues => get_cmd_queues()
1332 
1333  ! Two copy strategies depending on how much of the total length
1334  ! nx covers.
1335  if (nx < size(to, 1) / 2) then
1336  ! Dispatch asynchronous copies of just the contiguous data.
1337  do i = starty, starty+ny
1338  size_in_bytes = int(nx, 8) * c_sizeof(to(1,1))
1339  offset_in_bytes = int(size(to, 1) * (i-1) + (startx-1)) &
1340  * c_sizeof(to(1,1))
1341  ierr = clEnqueueReadBuffer( &
1342  cmd_queues({self._OCL_MANAGEMENT_QUEUE}), cl_mem, &
1343  CL_FALSE, offset_in_bytes, size_in_bytes, &
1344  C_LOC(to(startx, i)), 0, C_NULL_PTR, C_NULL_PTR)
1345  CALL check_status("clEnqueueReadBuffer", ierr)
1346  enddo
1347  if (blocking) then
1348  CALL check_status("clFinish on read", &
1349  clFinish(cmd_queues({self._OCL_MANAGEMENT_QUEUE})))
1350  endif
1351  else
1352  ! Copy across the whole starty:starty+ny rows in a single
1353  ! copy operation.
1354  size_in_bytes = int(size(to, 1) * ny, 8) * c_sizeof(to(1,1))
1355  offset_in_bytes = int(size(to,1)*(starty-1), 8) &
1356  * c_sizeof(to(1,1))
1357  ierr = clEnqueueReadBuffer( &
1358  cmd_queues({self._OCL_MANAGEMENT_QUEUE}), cl_mem, &
1359  CL_TRUE, offset_in_bytes, size_in_bytes, &
1360  C_LOC(to(1,starty)), 0, C_NULL_PTR, C_NULL_PTR)
1361  CALL check_status("clEnqueueReadBuffer", ierr)
1362  endif
1363  end subroutine read_sub
1364  '''
1365 
1366  # Obtain the PSyIR representation of the code above
1367  fortran_reader = FortranReader()
1368  container = fortran_reader.psyir_from_source(code)
1369  subroutine = container.children[0]
1370 
1371  # Rename subroutine
1372  subroutine.name = subroutine_name
1373 
1374  # Add the subroutine as child of the provided node
1375  node.addchild(subroutine.detach())
1376 
1377  return symtab.lookup_with_tag("ocl_read_func")
1378 
1379  def _insert_ocl_write_to_device_function(self, node):
1380  '''
1381  Returns the symbol of a subroutine that writes the buffer data into
1382  an OpenCL device using FortCL. If the subroutine doesn't already exist
1383  it also generates it.
1384 
1385  :param node: the container where the new subroutine will be inserted.
1386  :type node: :py:class:`psyclone.psyir.nodes.Container`
1387 
1388  :returns: the symbol of the buffer writing subroutine.
1389  :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
1390 
1391  '''
1392  symtab = node.symbol_table
1393  try:
1394  return symtab.lookup_with_tag("ocl_write_func")
1395  except KeyError:
1396  # If the subroutines does not exist, it needs to be
1397  # generated first.
1398  pass
1399 
1400  # Create the symbol for the routine and add it to the symbol table.
1401  subroutine_name = symtab.new_symbol("write_to_device",
1402  symbol_type=RoutineSymbol,
1403  tag="ocl_write_func").name
1404 
1405  # Code of the subroutine in Fortran
1406  code = f'''
1407  subroutine write_sub(from, to, startx, starty, nx, ny, blocking)
1408  USE iso_c_binding, only: c_ptr, c_intptr_t, c_size_t, c_sizeof
1409  USE ocl_utils_mod, ONLY: check_status
1410  use kind_params_mod, only: go_wp
1411  USE clfortran
1412  USE fortcl, ONLY: get_cmd_queues
1413  real(go_wp), intent(in), dimension(:,:), target :: from
1414  type(c_ptr), intent(in) :: to
1415  integer, intent(in) :: startx, starty, nx, ny
1416  logical, intent(in) :: blocking
1417  integer(c_intptr_t) :: cl_mem
1418  INTEGER(c_size_t) :: size_in_bytes, offset_in_bytes
1419  INTEGER(c_intptr_t), pointer :: cmd_queues(:)
1420  integer :: ierr, i
1421 
1422  ! Give the to pointer the appropriate OpenCL memory object type
1423  cl_mem = transfer(to, cl_mem)
1424  cmd_queues => get_cmd_queues()
1425 
1426  ! Two copy strategies depending on how much of the total length
1427  ! nx covers.
1428  if (nx < size(from,1) / 2) then
1429  ! Dispatch asynchronous copies of just the contiguous data.
1430  do i=starty, starty+ny
1431  size_in_bytes = int(nx, 8) * c_sizeof(from(1,1))
1432  offset_in_bytes = int(size(from, 1) * (i-1) + (startx-1)) &
1433  * c_sizeof(from(1,1))
1434  ierr = clEnqueueWriteBuffer( &
1435  cmd_queues({self._OCL_MANAGEMENT_QUEUE}), cl_mem, &
1436  CL_FALSE, offset_in_bytes, size_in_bytes, &
1437  C_LOC(from(startx, i)), 0, C_NULL_PTR, C_NULL_PTR)
1438  CALL check_status("clEnqueueWriteBuffer", ierr)
1439  enddo
1440  if (blocking) then
1441  CALL check_status("clFinish on write", &
1442  clFinish(cmd_queues({self._OCL_MANAGEMENT_QUEUE})))
1443  endif
1444  else
1445  ! Copy across the whole starty:starty+ny rows in a single
1446  ! copy operation.
1447  size_in_bytes = int(size(from,1) * ny, 8) * c_sizeof(from(1,1))
1448  offset_in_bytes = int(size(from,1) * (starty-1)) &
1449  * c_sizeof(from(1,1))
1450  ierr = clEnqueueWriteBuffer(&
1451  cmd_queues({self._OCL_MANAGEMENT_QUEUE}), cl_mem, &
1452  CL_TRUE, offset_in_bytes, size_in_bytes, &
1453  C_LOC(from(1, starty)), 0, C_NULL_PTR, C_NULL_PTR)
1454  CALL check_status("clEnqueueWriteBuffer", ierr)
1455  endif
1456  end subroutine write_sub
1457  '''
1458 
1459  # Obtain the PSyIR representation of the code above
1460  fortran_reader = FortranReader()
1461  container = fortran_reader.psyir_from_source(code)
1462  subroutine = container.children[0]
1463  # Rename subroutine
1464  subroutine.name = subroutine_name
1465 
1466  # Add the subroutine as child of the provided node
1467  node.addchild(subroutine.detach())
1468 
1469  return symtab.lookup_with_tag("ocl_write_func")
1470 
1471  @staticmethod
1472  def _insert_ocl_initialise_buffer(node):
1473  '''
1474  Returns the symbol of a subroutine that initialises an OpenCL buffer in
1475  the OpenCL device using FortCL. If the subroutine doesn't already exist
1476  it also generates it.
1477 
1478  :param node: the container where the new subroutine will be inserted.
1479  :type node: :py:class:`psyclone.psyir.nodes.Container`
1480  :returns: the symbol of the buffer initialisation subroutine.
1481  :rtype: :py:class:`psyclone.psyir.symbols.RoutineSymbol`
1482 
1483  '''
1484  # pylint: disable=too-many-locals
1485  symtab = node.symbol_table
1486  try:
1487  return symtab.lookup_with_tag("ocl_init_buffer_func")
1488  except KeyError:
1489  # If the Symbol does not exist, the rest of this method
1490  # will generate it.
1491  pass
1492 
1493  # Create the symbol for the routine and add it to the symbol table.
1494  subroutine_name = symtab.new_symbol("initialise_device_buffer",
1495  symbol_type=RoutineSymbol,
1496  tag="ocl_init_buffer_func").name
1497 
1498  # Get the GOcean API property names used in this routine
1499  api_config = Config.get().api_conf("gocean1.0")
1500  host_buff = \
1501  api_config.grid_properties["go_grid_data"].fortran.format("field")
1502  props = api_config.grid_properties
1503  num_x = props["go_grid_nx"].fortran.format("field")
1504  num_y = props["go_grid_ny"].fortran.format("field")
1505 
1506  # Fields need to provide a function pointer to how the
1507  # device data is going to be read and written, if it doesn't
1508  # exist, create the appropriate subroutine first.
1509  read_fp = symtab.lookup_with_tag("ocl_read_func").name
1510  write_fp = symtab.lookup_with_tag("ocl_write_func").name
1511 
1512  # Code of the subroutine in Fortran
1513  code = f'''
1514  subroutine initialise_device_buffer(field)
1515  USE fortcl, ONLY: create_rw_buffer
1516  USE iso_c_binding, only: c_size_t
1517  use field_mod
1518  type(r2d_field), intent(inout), target :: field
1519  integer(kind=c_size_t) size_in_bytes
1520  IF (.NOT. field%data_on_device) THEN
1521  size_in_bytes = int({num_x} * {num_y}, 8) * &
1522  c_sizeof({host_buff}(1,1))
1523  ! Create buffer on device, we store it without type information
1524  ! on the dl_esm_inf pointer (transfer/static_cast to void*)
1525  field%device_ptr = transfer( &
1526  create_rw_buffer(size_in_bytes), &
1527  field%device_ptr)
1528  field%data_on_device = .true.
1529  field%read_from_device_f => {read_fp}
1530  field%write_to_device_f => {write_fp}
1531  END IF
1532  end subroutine initialise_device_buffer
1533  '''
1534 
1535  # Obtain the PSyIR representation of the code above
1536  fortran_reader = FortranReader()
1537  container = fortran_reader.psyir_from_source(code)
1538  subroutine = container.children[0]
1539  # Rename subroutine
1540  subroutine.name = subroutine_name
1541 
1542  # Add the subroutine as child of the provided node
1543  node.addchild(subroutine.detach())
1544 
1545  return symtab.lookup_with_tag("ocl_init_buffer_func")
1546 
1547 
1548 # For AutoAPI documentation generation
1549 __all__ = ["GOOpenCLTrans"]
def _add_divisibility_check(node, position, check_status, global_size_expr, local_size)
def _add_ready_check(node, position, check_status, kernel_name, flag, cl_finish, cmd_queue)
def _add_kernel_check(node, position, check_status, kernel_name, flag, cl_finish, cmd_queue)
def validate(self, node, options=None)
Definition: psyGen.py:2799