from ctypes import c_float, c_int32, c_int64, c_ulong, sizeof
import numpy as np
from pyfr.backends.base import (BaseKernelProvider, BaseOrderedMetaKernel,
BasePointwiseKernelProvider,
BaseUnorderedMetaKernel, Kernel)
from pyfr.backends.metal.util import call_
from pyfr.backends.metal.generator import MetalKernelGenerator
from pyfr.nputil import npdtype_to_ctypestype
from pyfr.util import memoize
class MetalKernel(Kernel):
    """A single kernel which can be recorded into a Metal graph."""

    def add_to_graph(self, graph, dnodes):
        """Append this kernel to ``graph.klist``.

        ``dnodes`` is accepted but not consulted here.  The return value
        is the length of ``graph.klist`` after the append.
        """
        klist = graph.klist
        klist.append(self)

        return len(klist)
class _MetalMetaKernel:
    """Mixin giving meta kernels a Metal-specific ``add_to_graph``."""

    def add_to_graph(self, graph, dnodes):
        """Add every kernel in ``self.kernels`` to *graph*, in order.

        Each constituent kernel receives the same ``graph`` and
        ``dnodes`` arguments.  Returns the length of ``graph.klist``
        once all of them have been added.
        """
        for kern in self.kernels:
            kern.add_to_graph(graph, dnodes)

        return len(graph.klist)
class MetalOrderedMetaKernel(_MetalMetaKernel, BaseOrderedMetaKernel):
    """Ordered meta kernel using ``_MetalMetaKernel.add_to_graph``."""
class MetalUnorderedMetaKernel(_MetalMetaKernel, BaseUnorderedMetaKernel):
    """Unordered meta kernel using ``_MetalMetaKernel.add_to_graph``."""
class MetalKernelProvider(BaseKernelProvider):
    """Builds and benchmarks compute kernels for the Metal backend.

    Source is compiled into a library on ``self.backend.dev``; a named
    function from that library is turned into a compute pipeline state
    and wrapped in a closure which encodes a dispatch into a command
    buffer.
    """

    # Maps each supported ctypes scalar type to a tuple of
    # (default-constructed instance, size in bytes)
    typemap = {
        k: (k(), sizeof(k)) for k in (c_float, c_int32, c_int64, c_ulong)
    }

    def _benchmark(self, kfunc, nbench=40, nwarmup=25):
        """Return the mean GPU time taken by one encoding of *kfunc*.

        ``kfunc`` is called with a command buffer as its only argument.
        It is invoked ``nwarmup`` times on one buffer and ``nbench``
        times on a second; the warm-up buffer is committed first.  Only
        the second buffer is waited on and timed.

        The result is the difference between ``GPUEndTime()`` and
        ``GPUStartTime()`` of the benchmark buffer divided by
        ``nbench``, so ``nbench`` must be non-zero.
        """
        cbuf_warmup = self.backend.queue.commandBuffer()
        cbuf_bench = self.backend.queue.commandBuffer()

        for _ in range(nwarmup):
            kfunc(cbuf_warmup)

        for _ in range(nbench):
            kfunc(cbuf_bench)

        cbuf_warmup.commit()
        cbuf_bench.commit()
        cbuf_bench.waitUntilCompleted()

        return (cbuf_bench.GPUEndTime() - cbuf_bench.GPUStartTime()) / nbench

    @memoize
    def _build_program(self, src):
        """Compile the source string *src* and return the library.

        Compilation is done with fast math enabled.  The method is
        wrapped by ``memoize`` (presumably so that identical sources
        are only compiled once -- confirm against ``pyfr.util``).

        Raises ``ValueError`` if the compiler reports an error.
        """
        from Metal import MTLCompileOptions

        # Set the compiler options
        opts = MTLCompileOptions.new()
        opts.setFastMathEnabled_(True)

        # Compile the kernel
        lib, err = call_(self.backend.dev, 'newLibraryWith', source=src,
                         options=opts, error=None)
        if err is not None:
            raise ValueError(f'Compiler error: {err}')

        return lib

    def _build_kernel(self, name, src, argtypes, argn=[]):
        """Build the kernel *name* from *src* and return an encoder.

        ``argtypes`` gives one NumPy type per kernel argument.  Those
        equal to ``np.uintp`` are bound as buffers; all others are
        converted with ``npdtype_to_ctypestype`` and bound by value.
        ``argn`` is not used by this method.

        The returned callable has the signature
        ``encode(cbuf, grid, tgrp, *args)``.  It opens a compute command
        encoder on ``cbuf``, binds ``args`` by position, dispatches
        ``grid`` threads with ``tgrp`` threads per threadgroup, and ends
        the encoding.  Each pointer argument must be a
        ``(buffer, offset)`` pair.

        Raises ``KeyError`` if the function cannot be found in the
        compiled library and ``RuntimeError`` if no pipeline state is
        obtained.
        """
        from Metal import MTLComputePipelineDescriptor, MTLSizeMake

        # Build the program
        lib = self._build_program(src)

        # Fetch the function
        func = call_(lib, 'newFunctionWith', name=name)
        if func is None:
            raise KeyError(f'Unable to load function {name}')

        # Create the pipeline descriptor
        desc = MTLComputePipelineDescriptor.alloc().init()
        desc.setComputeFunction_(func)
        desc.setThreadGroupSizeIsMultipleOfThreadExecutionWidth_(True)

        # Obtain the corresponding compute pipeline
        # NOTE(review): _build_program unpacks the result of a call_ with
        # error=None as a (value, err) pair, whereas here the result is
        # bound to a single name.  If this call returns the same shape
        # then cpsf is a tuple and the check below can never fire --
        # confirm against call_ and the selector it resolves to.
        cpsf = call_(self.backend.dev, 'newComputePipelineStateWith',
                     descriptor=desc, error=None)
        if cpsf is None:
            raise RuntimeError('Unable to create compute pipeline state')

        # Classify the arguments as either pointers or scalars
        pargs, sargs = [], []
        for i, argt in enumerate(argtypes):
            if argt == np.uintp:
                pargs.append(i)
            else:
                ctype = npdtype_to_ctypestype(argt)
                sargs.append((i, ctype(), sizeof(ctype)))

        def encode(cbuf, grid, tgrp, *args):
            cce = cbuf.computeCommandEncoder()
            cce.setComputePipelineState_(cpsf)

            # Pointer arguments are (buffer, offset) pairs
            for i in pargs:
                buf, off = args[i]
                cce.setBuffer_offset_atIndex_(buf, off, i)

            # Scalar arguments reuse one ctypes instance per slot, whose
            # value is overwritten on every call before being passed on
            for i, val, sz in sargs:
                val.value = args[i]
                cce.setBytes_length_atIndex_(val, sz, i)

            cce.dispatchThreads_threadsPerThreadgroup_(MTLSizeMake(*grid),
                                                       MTLSizeMake(*tgrp))
            cce.endEncoding()

        return encode