Using PyCUDA in a Theano operation

I intend to use my GPU to evaluate a custom Theano operation with a custom gradient. Since I have not yet found a way to use the gpuarray library that comes with Theano, I use PyCUDA and scikit-cuda on a separate GPU. Testing the custom Theano operation outside of PyMC3 works fine, and using NumPy on the CPU instead of PyCUDA also leads to the expected PyMC3 results. However, when I use PyCUDA I get the error

LogicError: cuFuncSetBlockShape failed: invalid resource handle

I presume this is due to the CUDA context not being available in the PyMC3 sampling thread. I might have to reinitialize it, but how could I achieve this? Here is a small example that reproduces the crash. In order to run it on a single GPU, I recommend setting the environment variable THEANO_FLAGS='floatX=float32,device=cuda0,gpuarray.preallocate=0.5' and removing the line cudart.cudaSetDevice(1).
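The flags can also be set from Python, as long as it happens before theano is imported (otherwise they are ignored); a small sketch of that, not part of the reproduction below:

import os

# must run before `import theano`, otherwise THEANO_FLAGS has no effect
os.environ['THEANO_FLAGS'] = 'floatX=float32,device=cuda0,gpuarray.preallocate=0.5'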

import numpy as np
import pandas as pd
import pymc3 as pm
import theano
import pycuda.autoinit
from pycuda import gpuarray, cumath
from skcuda import linalg, misc, cudart
from theano.tensor import as_tensor_variable
from theano.gof import Op, Apply
cudart.cudaSetDevice(1)  # run PyCUDA/skcuda on a different GPU than Theano
linalg.init()

class apotential(Op):
    def make_node(self, *inputs):
        ll = as_tensor_variable(inputs[0])
        return Apply(self, [ll], [ll.type()])

    def perform(self, node, inputs, output):
        alpha = gpuarray.to_gpu(inputs[0])
        print("eval")
        # some costly calculation
        result = misc.multiply(alpha, alpha)
        output[0][0] = -1 * result.get()

    def __init__(self, a):
        a = np.array(a).astype(theano.config.floatX)
        self.a = gpuarray.to_gpu(a)
        self._mgrad = potentialgrad(a)
        super(apotential, self).__init__()

    def grad(self, inputs, g):
        # chain rule: scale the op's own gradient by the upstream gradient
        grads = self._mgrad(inputs[0])
        return [g[0] * grads]
   
class potentialgrad(theano.Op):
    def make_node(self, *inputs):
        ll = as_tensor_variable(inputs[0])
        return Apply(self, [ll], [ll.type()])

    def perform(self, node, inputs, output):
        alpha = gpuarray.to_gpu(inputs[0])
        print("grad")
        # some costly calculation
        factor = np.array(-2).astype(theano.config.floatX)
        result = misc.multiply(alpha, factor)
        output[0][0] = result.get()
        
    def __init__(self, a):
        self.a = a
        super(potentialgrad, self).__init__()

apo = apotential(2)
with pm.Model() as model:
    alpha = pm.Uniform('alpha')
    beta = pm.Potential('beta', apo(alpha))
    trace = pm.sample(chains = 1)

And the output is

eval
---------------------------------------------------------------------------
LogicError                                Traceback (most recent call last)
<ipython-input-16-a4143f1774e4> in <module>
      2 with pm.Model() as model:
      3     alpha = pm.Uniform('alpha')
----> 4     beta = pm.Potential('beta', apo(alpha))
      5     trace = pm.sample(chains = 1)

~/Envs/noteEnv4/lib/python3.6/site-packages/theano/gof/op.py in __call__(self, *inputs, **kwargs)
    672                 thunk.outputs = [storage_map[v] for v in node.outputs]
    673 
--> 674                 required = thunk()
    675                 assert not required  # We provided all inputs
    676 

~/Envs/noteEnv4/lib/python3.6/site-packages/theano/gof/op.py in rval(p, i, o, n)
    890             # default arguments are stored in the closure of `rval`
    891             def rval(p=p, i=node_input_storage, o=node_output_storage, n=node):
--> 892                 r = p(n, [x[0] for x in i], o)
    893                 for o in node.outputs:
    894                     compute_map[o][0] = True

<ipython-input-15-d7a1127a650c> in perform(self, node, inputs, output)
      7         alpha = gpuarray.to_gpu(inputs[0])
      8         print("eval")
----> 9         result = misc.multiply(alpha, alpha)
     10         output[0][0] = -1 * result.get()
     11 

~/Envs/noteEnv4/lib/python3.6/site-packages/skcuda/misc.py in multiply(x_gpu, y_gpu)
   1079     """
   1080 
-> 1081     return binaryop_2d("*", operator.mul, True, x_gpu, y_gpu)
   1082 
   1083 def divide(x_gpu, y_gpu):

~/Envs/noteEnv4/lib/python3.6/site-packages/skcuda/misc.py in binaryop_2d(c_op, py_op, commutative, x_gpu, y_gpu)
    980 
    981     if x_gpu.shape == y_gpu.shape:
--> 982         return py_op(x_gpu, y_gpu)
    983     elif x_gpu.size == 1:
    984         return py_op(x_gpu.get().reshape(()), y_gpu)

~/Envs/noteEnv4/lib/python3.6/site-packages/pycuda/gpuarray.py in __mul__(self, other)
    486         if isinstance(other, GPUArray):
    487             result = self._new_like_me(_get_common_dtype(self, other))
--> 488             return self._elwise_multiply(other, result)
    489         else:
    490             result = self._new_like_me(_get_common_dtype(self, other))

~/Envs/noteEnv4/lib/python3.6/site-packages/pycuda/gpuarray.py in _elwise_multiply(self, other, out, stream)
    372         func.prepared_async_call(self._grid, self._block, stream,
    373                 self.gpudata, other.gpudata,
--> 374                 out.gpudata, self.mem_size)
    375 
    376         return out

~/Envs/noteEnv4/lib/python3.6/site-packages/pycuda/driver.py in function_prepared_async_call(func, grid, block, stream, *args, **kwargs)
    493     def function_prepared_async_call(func, grid, block, stream, *args, **kwargs):
    494         if isinstance(block, tuple):
--> 495             func._set_block_shape(*block)
    496         else:
    497             from warnings import warn

LogicError: cuFuncSetBlockShape failed: invalid resource handle

I also tried running the gpuarray.to_gpu calls from the __init__ method only in the make_node methods, but the error does not change.

Can I disable the multithreading of PyMC3 entirely? Am I doing something else wrong? Or is there a better way to use the GPU for matrix calculations in PyMC3?

It’s good that someone is attempting the task of parallelizing PyMC3 onto a GPU.

One of the major challenges with Bayesian modelling is that it is slow, so serious speed gains would be more than worthwhile. However, parallelizing MCMC is hard, and progress on this front seems to have stalled; the latest discussion is hosted here (where it seemed like someone had already figured it out and would only have to share the code?) - https://github.com/pymc-devs/pymc3/issues/3341

I’m certain there is a way to do this with PyMC3’s current engine, Theano, but PyMC4 is already somewhat in development and based on Google’s tech, so when that comes out it will probably be more parallelized than PyMC3 is now (if you want to be patient and wait ^^).

I found out that one has to use the make_thunk method of the Theano operation. The current Theano implementation diverges a little from the documented example, but thanks to a comprehensive error message it was easy to adjust.

The example given above can be fixed with:

import numpy as np
import pandas as pd
import pymc3 as pm
import theano
import pycuda.autoinit
from pycuda import gpuarray, cumath
from skcuda import linalg, misc, cudart
from theano.tensor import as_tensor_variable
from theano.gof import Op, Apply
linalg.init()

class apotential(Op):
    def make_node(self, *inputs):
        ll = as_tensor_variable(inputs)
        return Apply(self, [ll], [ll.type()])

    def make_thunk(self, node, storage_map, compute_map, rem=None, impl=None, no_recycling=[]):
        # storage_map[v] is a one-element list that acts as the storage cell of v
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]
        a = gpuarray.to_gpu(self.a)
        def thunk():
            # the GPU transfer now happens inside the thunk, i.e. in the
            # process/context that actually evaluates the graph
            alpha = gpuarray.to_gpu(np.asarray(inputs[0]))
            print("eval")
            result = misc.multiply(alpha, alpha)
            outputs[0][0] = -1 * result.get()[0]
            # mark the outputs as computed
            for v in node.outputs:
                compute_map[v][0] = True
        return thunk

    def __init__(self, a):
        # keep a host-side copy; the transfer to the GPU happens in make_thunk
        self.a = np.array(a).astype(theano.config.floatX)
        self._mgrad = potentialgrad(self.a)
        super(apotential, self).__init__()

    def grad(self, inputs, g):
        grads = self._mgrad(inputs[0])
        return [g[0] * grads]
    
class potentialgrad(theano.Op):
    def make_node(self, *inputs):
        ll = as_tensor_variable(inputs)
        return Apply(self, [ll], [ll.type()])

        
    def make_thunk(self, node, storage_map, compute_map, rem=None, impl=None, no_recycling=[]):
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]
        a = gpuarray.to_gpu(self.a)
        def thunk():
            alpha = gpuarray.to_gpu(np.asarray(inputs[0]))
            print("grad")
            factor = np.array(-2).astype(theano.config.floatX)
            result = misc.multiply(alpha, factor)
            
            outputs[0][0] = result.get()[0]
            for v in node.outputs:
                compute_map[v][0] = True
        return thunk
        
    def __init__(self, a):
        self.a = a
        super(potentialgrad, self).__init__()

apo = apotential(2)
with pm.Model() as model:
    alpha = pm.Uniform('alpha')
    beta = pm.Potential('beta', apo(alpha))
    trace = pm.sample(chains = 1)

However, one has to have Theano use the CPU, otherwise the passed input already resides on the GPU; one would probably have to load it into CPU memory first and then into the GPU memory of the “new” PyCUDA instance. I just used the Theano flags floatX=float32,device=cpu. There is probably a better way altogether to use the GPU for a custom Theano operation/PyMC3 potential with a custom gradient.
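If one wanted to keep Theano on the GPU instead, that round trip could roughly look like the helper below (an untested sketch; it assumes the value handed to the thunk is a pygpu GpuArray from Theano’s context, and the helper name to_pycuda is made up here):

import numpy as np
import theano
from pycuda import gpuarray

def to_pycuda(x):
    # pull the value into host memory first (np.asarray also works for
    # pygpu GpuArrays), then push it into the separate PyCUDA context
    host = np.asarray(x, dtype=theano.config.floatX)
    return gpuarray.to_gpu(host)

Inside the thunk one would then call to_pycuda(inputs[0]) instead of gpuarray.to_gpu(np.asarray(inputs[0])).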