I intend to use my GPU to evaluate a custom Theano op with a custom gradient. Since I have not yet found a way to use the gpuarray library that ships with Theano, I use PyCUDA and scikit-cuda on a separate GPU. Testing the custom op outside of PyMC3 works fine (a sketch of that test is at the end of this post), and using NumPy on the CPU instead of PyCUDA also gives the expected PyMC3 results. However, when I use PyCUDA I get the error
LogicError: cuFuncSetBlockShape failed: invalid resource handle
I presume this is because the CUDA context is not available in the process or thread in which PyMC3 runs the sampler. I might have to reinitialize it there, but how could I achieve this?
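To make the question concrete, this is the kind of reinitialization I have in mind inside perform (an untested sketch; the lazy, class-level _ctx cache is my own guess, not working code):

import pycuda.driver as cuda

class apotential(Op):
    _ctx = None  # hypothetical per-process context cache

    def perform(self, node, inputs, output):
        # Create the CUDA context lazily in whatever process actually
        # executes the op, instead of relying on the pycuda.autoinit
        # context created in the parent process.
        if apotential._ctx is None:
            cuda.init()
            apotential._ctx = cuda.Device(1).make_context()
            linalg.init()
        # ... rest of perform as in the script below ...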
Here is a small example that reproduces the crash. To run it on a single GPU, set the environment variable THEANO_FLAGS='floatX=float32,device=cuda0,gpuarray.preallocate=0.5' and remove the line cudart.cudaSetDevice(1).
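As an aside, the flags can also be set from inside Python, as long as this happens before the first import of theano (a small sketch):

import os
# Must run before theano is imported anywhere in the process.
os.environ['THEANO_FLAGS'] = 'floatX=float32,device=cuda0,gpuarray.preallocate=0.5'
import theano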
import numpy as np
import pandas as pd
import pymc3 as pm
import theano
import pycuda.autoinit
from pycuda import gpuarray, cumath
from skcuda import linalg, misc, cudart
from theano.tensor import as_tensor_variable
from theano.gof import Op, Apply

cudart.cudaSetDevice(1)
linalg.init()

class apotential(Op):
    def make_node(self, *inputs):
        ll = as_tensor_variable(inputs[0])
        return Apply(self, [ll], [ll.type()])

    def perform(self, node, inputs, output):
        alpha = gpuarray.to_gpu(inputs[0])
        print("eval")
        # some costly calculation; here just -alpha**2 element-wise
        result = misc.multiply(alpha, alpha)
        output[0][0] = -1 * result.get()

    def __init__(self, a):
        a = np.array(a).astype(theano.config.floatX)
        self.a = gpuarray.to_gpu(a)
        self._mgrad = potentialgrad(a)
        super(apotential, self).__init__()

    def grad(self, inputs, g):
        grads = self._mgrad(inputs[0])
        return [g[0] * grads]

class potentialgrad(theano.Op):
    def make_node(self, *inputs):
        ll = as_tensor_variable(inputs[0])
        return Apply(self, [ll], [ll.type()])

    def perform(self, node, inputs, output):
        alpha = gpuarray.to_gpu(inputs[0])
        print("grad")
        # some costly calculation; here just -2*alpha element-wise
        factor = np.array(-2).astype(theano.config.floatX)
        result = misc.multiply(alpha, factor)
        output[0][0] = result.get()

    def __init__(self, a):
        self.a = a
        super(potentialgrad, self).__init__()

apo = apotential(2)

with pm.Model() as model:
    alpha = pm.Uniform('alpha')
    beta = pm.Potential('beta', apo(alpha))
    trace = pm.sample(chains=1)
And the output is
eval
---------------------------------------------------------------------------
LogicError Traceback (most recent call last)
<ipython-input-16-a4143f1774e4> in <module>
2 with pm.Model() as model:
3 alpha = pm.Uniform('alpha')
----> 4 beta = pm.Potential('beta', apo(alpha))
5 trace = pm.sample(chains = 1)
~/Envs/noteEnv4/lib/python3.6/site-packages/theano/gof/op.py in __call__(self, *inputs, **kwargs)
672 thunk.outputs = [storage_map[v] for v in node.outputs]
673
--> 674 required = thunk()
675 assert not required # We provided all inputs
676
~/Envs/noteEnv4/lib/python3.6/site-packages/theano/gof/op.py in rval(p, i, o, n)
890 # default arguments are stored in the closure of `rval`
891 def rval(p=p, i=node_input_storage, o=node_output_storage, n=node):
--> 892 r = p(n, [x[0] for x in i], o)
893 for o in node.outputs:
894 compute_map[o][0] = True
<ipython-input-15-d7a1127a650c> in perform(self, node, inputs, output)
7 alpha = gpuarray.to_gpu(inputs[0])
8 print("eval")
----> 9 result = misc.multiply(alpha, alpha)
10 output[0][0] = -1 * result.get()
11
~/Envs/noteEnv4/lib/python3.6/site-packages/skcuda/misc.py in multiply(x_gpu, y_gpu)
1079 """
1080
-> 1081 return binaryop_2d("*", operator.mul, True, x_gpu, y_gpu)
1082
1083 def divide(x_gpu, y_gpu):
~/Envs/noteEnv4/lib/python3.6/site-packages/skcuda/misc.py in binaryop_2d(c_op, py_op, commutative, x_gpu, y_gpu)
980
981 if x_gpu.shape == y_gpu.shape:
--> 982 return py_op(x_gpu, y_gpu)
983 elif x_gpu.size == 1:
984 return py_op(x_gpu.get().reshape(()), y_gpu)
~/Envs/noteEnv4/lib/python3.6/site-packages/pycuda/gpuarray.py in __mul__(self, other)
486 if isinstance(other, GPUArray):
487 result = self._new_like_me(_get_common_dtype(self, other))
--> 488 return self._elwise_multiply(other, result)
489 else:
490 result = self._new_like_me(_get_common_dtype(self, other))
~/Envs/noteEnv4/lib/python3.6/site-packages/pycuda/gpuarray.py in _elwise_multiply(self, other, out, stream)
372 func.prepared_async_call(self._grid, self._block, stream,
373 self.gpudata, other.gpudata,
--> 374 out.gpudata, self.mem_size)
375
376 return out
~/Envs/noteEnv4/lib/python3.6/site-packages/pycuda/driver.py in function_prepared_async_call(func, grid, block, stream, *args, **kwargs)
493 def function_prepared_async_call(func, grid, block, stream, *args, **kwargs):
494 if isinstance(block, tuple):
--> 495 func._set_block_shape(*block)
496 else:
497 from warnings import warn
LogicError: cuFuncSetBlockShape failed: invalid resource handle
I also tried running the gpuarray.to_gpu calls of the __init__ method only in the make_node methods instead, but the error does not change.
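For completeness, this is roughly how I verified the op outside of PyMC3, where both the value and the gradient evaluate without errors (a minimal sketch reusing apotential from the script above; the test values are arbitrary):

import theano.tensor as tt

x = tt.matrix('x')
apo = apotential(2)

f = theano.function([x], apo(x))                        # forward pass, prints "eval"
g = theano.function([x], theano.grad(apo(x).sum(), x))  # gradient, prints "grad"

xval = np.ones((2, 2), dtype=theano.config.floatX)
print(f(xval))  # expected: -(x * x)
print(g(xval))  # expected: -2 * x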
Can I disable the multithreading of PyMC3 entirely? Am I doing something else wrong? Or is there a better way to use the GPU for matrix calculations in PyMC3?
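Regarding the first question, what I have in mind is something like this (a sketch; I am assuming a PyMC3 version where pm.sample accepts cores, and I have not verified that this keeps the sampler in the process that owns the CUDA context):

with pm.Model() as model:
    alpha = pm.Uniform('alpha')
    beta = pm.Potential('beta', apo(alpha))
    # chains=1, cores=1 should run a single chain in a single
    # sampling process instead of spawning worker processes
    trace = pm.sample(chains=1, cores=1)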