Sparse Gradient

Hi,

is it possible to use sparse gradients in PyMC? theano.sparse.structured_dot implements a structured gradient that returns only the derivatives at the non-zero positions. However, PyMC seems to create a dense matrix when setting up the gradient:

import numpy as np
import scipy.sparse as sp
import theano
import pymc3 as pm

N = 1_000_000
density = 1e-7  # ~100,000 non-zero entries in the N x N matrix

rng = np.random.default_rng()
# sparse design matrix with ones at random positions
sX = sp.random(N, N, density=density, format='csr', data_rvs=np.ones, random_state=rng, dtype=int)
tX = theano.sparse.basic.as_sparse_variable(sX)

y = np.random.normal(size=N)

with pm.Model() as model:
    x = pm.Normal('x', shape=(N, 1))
    # likelihood mean is a sparse-dense product via structured_dot
    pm.Normal('y', theano.sparse.structured_dot(tX, x).flatten(), observed=y)
    pm.find_MAP()

If less than 7 TB of memory is available, this fails with:

---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
Input In [93], in <module>
     16 x = pm.Normal('x', shape=(N, 1))
     17 pm.Normal('y', theano.sparse.structured_dot(tX, x).flatten(), observed=y)
---> 18 pm.find_MAP()

File ~/.conda/envs/pymc3_nv1/lib/python3.9/site-packages/pymc3/tuning/starting.py:114, in find_MAP(start, vars, method, return_raw, include_transformed, progressbar, maxeval, model, *args, **kwargs)
    111 x0 = bij.map(start)
    113 try:
--> 114     dlogp_func = bij.mapf(model.fastdlogp_nojac(vars))
    115     compute_gradient = True
    116 except (AttributeError, NotImplementedError, tg.NullTypeGradError):

File ~/.conda/envs/pymc3_nv1/lib/python3.9/site-packages/pymc3/model.py:462, in Factor.fastdlogp_nojac(self, vars)
    460 def fastdlogp_nojac(self, vars=None):
    461     """Compiled log density gradient function, without jacobian terms."""
--> 462     return self.model.fastfn(gradient(self.logp_nojact, vars))

File ~/.conda/envs/pymc3_nv1/lib/python3.9/site-packages/pymc3/theanof.py:134, in gradient(f, vars)
    131     vars = cont_inputs(f)
    133 if vars:
--> 134     return tt.concatenate([gradient1(f, v) for v in vars], axis=0)
    135 else:
    136     return empty_gradient

File ~/.conda/envs/pymc3_nv1/lib/python3.9/site-packages/pymc3/theanof.py:134, in <listcomp>(.0)
    131     vars = cont_inputs(f)
    133 if vars:
--> 134     return tt.concatenate([gradient1(f, v) for v in vars], axis=0)
    135 else:
    136     return empty_gradient

File ~/.conda/envs/pymc3_nv1/lib/python3.9/site-packages/pymc3/theanof.py:123, in gradient1(f, v)
    121 def gradient1(f, v):
    122     """flat gradient of f wrt v"""
--> 123     return tt.flatten(tt.grad(f, v, disconnected_inputs="warn"))

File ~/.conda/envs/pymc3_nv1/lib/python3.9/site-packages/theano/gradient.py:639, in grad(cost, wrt, consider_constant, disconnected_inputs, add_names, known_grads, return_disconnected, null_gradients)
    636     if hasattr(g.type, "dtype"):
    637         assert g.type.dtype in theano.tensor.float_dtypes
--> 639 rval = _populate_grad_dict(var_to_app_to_idx, grad_dict, wrt, cost_name)
    641 for i in range(len(rval)):
    642     if isinstance(rval[i].type, NullType):

File ~/.conda/envs/pymc3_nv1/lib/python3.9/site-packages/theano/gradient.py:1440, in _populate_grad_dict(var_to_app_to_idx, grad_dict, wrt, cost_name)
   1437     # end if cache miss
   1438     return grad_dict[var]
-> 1440 rval = [access_grad_cache(elem) for elem in wrt]
   1442 return rval

File ~/.conda/envs/pymc3_nv1/lib/python3.9/site-packages/theano/gradient.py:1440, in <listcomp>(.0)
   1437     # end if cache miss
   1438     return grad_dict[var]
-> 1440 rval = [access_grad_cache(elem) for elem in wrt]
   1442 return rval

File ~/.conda/envs/pymc3_nv1/lib/python3.9/site-packages/theano/gradient.py:1393, in _populate_grad_dict.<locals>.access_grad_cache(var)
   1390 for node in node_to_idx:
   1391     for idx in node_to_idx[node]:
-> 1393         term = access_term_cache(node)[idx]
   1395         if not isinstance(term, Variable):
   1396             raise TypeError(
   1397                 f"{node.op}.grad returned {type(term)}, expected"
   1398                 " Variable instance."
   1399             )

File ~/.conda/envs/pymc3_nv1/lib/python3.9/site-packages/theano/gradient.py:1297, in _populate_grad_dict.<locals>.access_term_cache(node)
   1284     raise TypeError(
   1285         (
   1286             f"{node.op}.grad returned None for a gradient term, "
   (...)
   1292         )
   1293     )
   1295 # Check that the gradient term for this input
   1296 # has the right shape
-> 1297 if hasattr(term, "shape"):
   1298     orig_ipt = inputs[i]
   1299     for orig_ipt_v, term_v in get_test_values(orig_ipt, term):

File ~/.conda/envs/pymc3_nv1/lib/python3.9/site-packages/theano/sparse/basic.py:290, in _sparse_py_operators.<lambda>(self)
    287 def toarray(self):
    288     return dense_from_sparse(self)
--> 290 shape = property(lambda self: tensor.shape(dense_from_sparse(self)))
    291 # don't worry!
    292 # the plan is that the ShapeFeature in tensor.opt will do shape propagation
    293 # and remove the dense_from_sparse from the graph.  This will *NOT*
    294 # actually expand your sparse matrix just to get the shape.
    295 ndim = property(lambda self: self.type.ndim)

File ~/.conda/envs/pymc3_nv1/lib/python3.9/site-packages/theano/graph/op.py:253, in Op.__call__(self, *inputs, **kwargs)
    250 node = self.make_node(*inputs, **kwargs)
    252 if config.compute_test_value != "off":
--> 253     compute_test_value(node)
    255 if self.default_output is not None:
    256     rval = node.outputs[self.default_output]

File ~/.conda/envs/pymc3_nv1/lib/python3.9/site-packages/theano/graph/op.py:130, in compute_test_value(node)
    127 thunk.inputs = [storage_map[v] for v in node.inputs]
    128 thunk.outputs = [storage_map[v] for v in node.outputs]
--> 130 required = thunk()
    131 assert not required  # We provided all inputs
    133 for output in node.outputs:
    134     # Check that the output has been computed

File ~/.conda/envs/pymc3_nv1/lib/python3.9/site-packages/theano/graph/op.py:476, in Op.make_py_thunk.<locals>.rval(p, i, o, n)
    475 def rval(p=p, i=node_input_storage, o=node_output_storage, n=node):
--> 476     r = p(n, [x[0] for x in i], o)
    477     for o in node.outputs:
    478         compute_map[o][0] = True

File ~/.conda/envs/pymc3_nv1/lib/python3.9/site-packages/theano/sparse/basic.py:915, in DenseFromSparse.perform(self, node, inputs, outputs)
    913     out[0] = x
    914 else:
--> 915     out[0] = x.toarray()
    916 assert _is_dense(out[0])

File ~/.conda/envs/pymc3_nv1/lib/python3.9/site-packages/scipy/sparse/_compressed.py:1051, in _cs_matrix.toarray(self, order, out)
   1049 if out is None and order is None:
   1050     order = self._swap('cf')[0]
-> 1051 out = self._process_toarray_args(order, out)
   1052 if not (out.flags.c_contiguous or out.flags.f_contiguous):
   1053     raise ValueError('Output array must be C or F contiguous')

File ~/.conda/envs/pymc3_nv1/lib/python3.9/site-packages/scipy/sparse/_base.py:1288, in spmatrix._process_toarray_args(self, order, out)
   1286     return out
   1287 else:
-> 1288     return np.zeros(self.shape, dtype=self.dtype, order=order)

MemoryError: Unable to allocate 7.28 TiB for an array with shape (1000000, 1000000) and data type float64

Is there a way to avoid storing the dense N x N matrix? Should I post this as a feature request on GitHub?

I was able to run your code as-is, with the MAP optimization finishing in 2 seconds, on a 2020 MacBook Pro with PyMC3 version 3.11.4, SciPy version 1.7.3, and Theano version 1.1.2. I noticed that this GitHub issue appears to be related and mentions that computing test values may trigger the dense computation. However, I was able to run the script successfully with the "raise" setting for theano.config.compute_test_value, which should have reproduced your problem.

Hi @ckrapu, thank you very much for taking a look at this.

I saw the GitHub issue but felt that my case is slightly different. For me, the error above is triggered even if I explicitly set theano.config.compute_test_value = 'off'. I also profiled the memory use of the script for N = 10_000 and saw that the memory apparently remains occupied throughout the MAP optimization.
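
For context, here is a minimal sketch of how this kind of peak-memory measurement can be done with the memory_profiler package, reusing the definitions from the script above (this is an illustration, not necessarily the exact profiling setup I used):

from memory_profiler import memory_usage  # assumed installed; illustration only

def run_model():
    # same model as in the script above, just wrapped in a function
    with pm.Model():
        x = pm.Normal('x', shape=(N, 1))
        pm.Normal('y', theano.sparse.structured_dot(tX, x).flatten(), observed=y)
        pm.find_MAP()

# sample memory every 0.1 s while run_model executes and report the peak in MiB
peak = memory_usage((run_model, (), {}), interval=0.1, max_usage=True)
print("peak memory (MiB):", peak)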

My versions are:
Python: 3.9
SciPy: 1.8.0
Theano: 1.1.2
PyMC3: 3.11.4
Ubuntu: 18.04.5 LTS
and I removed my ~/.theanorc.

However, I have now tried the script on a 2019 MacBook Pro, and indeed it finished successfully there. The peak memory usage was only 561.73 MiB for N = 10_000 and only 1234.56 MiB for N = 1_000_000. Additionally, the traceback above shows that config.compute_test_value is evaluated

File ~/.conda/envs/pymc3_nv1/lib/python3.9/site-packages/theano/graph/op.py:253, in Op.__call__(self, *inputs, **kwargs)
    250 node = self.make_node(*inputs, **kwargs)
    252 if config.compute_test_value != "off":
--> 253     compute_test_value(node)
    255 if self.default_output is not None:
    256     rval = node.outputs[self.default_output]

and plugging in some debugging code reveals that config.compute_test_value == 'raise' at this time.
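
For completeness, here is a sketch of the kind of check I mean, assuming Theano 1.1.x where the relevant call site is theano.graph.op.Op.__call__ (the wrapper below is purely illustrative):

import theano
from theano.graph.op import Op

_orig_call = Op.__call__

def _debug_call(self, *inputs, **kwargs):
    # print the flag right before Theano decides whether to compute test values
    print("compute_test_value =", theano.config.compute_test_value)
    return _orig_call(self, *inputs, **kwargs)

Op.__call__ = _debug_call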

Why does the script use so much more memory on Linux?

I’m working with Dominik on this project; if it is useful, I’d like to highlight that the failure point is accessing the shape attribute of the sparse matrix. The error can even be reproduced in Theano alone:

import numpy as np
import scipy.sparse as sp
import theano
import theano.sparse

N = 1_000_000
density = 1e-7
theano.config.compute_test_value = 'raise'

rng = np.random.default_rng()
sX = sp.random(N, N, density=density, format='csr', data_rvs=np.ones, random_state=rng, dtype=int)
tX = theano.sparse.basic.as_sparse(sX)
tX.shape  # accessing the symbolic shape triggers the MemoryError below
---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
/loc/scratch/48760635/ipykernel_36364/1210979029.py in <module>
     11 sX = sp.random(N, N, density=density, format='csr', data_rvs=np.ones, random_state=rng, dtype=int)
     12 tX = theano.sparse.basic.as_sparse(sX)
---> 13 tX.shape

~/.conda/envs/env_a/lib/python3.8/site-packages/theano/sparse/basic.py in <lambda>(self)
    288         return dense_from_sparse(self)
    289 
--> 290     shape = property(lambda self: tensor.shape(dense_from_sparse(self)))
    291     # don't worry!
    292     # the plan is that the ShapeFeature in tensor.opt will do shape propagation

~/.conda/envs/env_a/lib/python3.8/site-packages/theano/graph/op.py in __call__(self, *inputs, **kwargs)
    251 
    252         if config.compute_test_value != "off":
--> 253             compute_test_value(node)
    254 
    255         if self.default_output is not None:

~/.conda/envs/env_a/lib/python3.8/site-packages/theano/graph/op.py in compute_test_value(node)
    128     thunk.outputs = [storage_map[v] for v in node.outputs]
    129 
--> 130     required = thunk()
    131     assert not required  # We provided all inputs
    132 

~/.conda/envs/env_a/lib/python3.8/site-packages/theano/graph/op.py in rval(p, i, o, n)
    474             # default arguments are stored in the closure of `rval`
    475             def rval(p=p, i=node_input_storage, o=node_output_storage, n=node):
--> 476                 r = p(n, [x[0] for x in i], o)
    477                 for o in node.outputs:
    478                     compute_map[o][0] = True

~/.conda/envs/env_a/lib/python3.8/site-packages/theano/sparse/basic.py in perform(self, node, inputs, outputs)
    913             out[0] = x
    914         else:
--> 915             out[0] = x.toarray()
    916         assert _is_dense(out[0])
    917 

~/.conda/envs/env_a/lib/python3.8/site-packages/scipy/sparse/compressed.py in toarray(self, order, out)
   1037         if out is None and order is None:
   1038             order = self._swap('cf')[0]
-> 1039         out = self._process_toarray_args(order, out)
   1040         if not (out.flags.c_contiguous or out.flags.f_contiguous):
   1041             raise ValueError('Output array must be C or F contiguous')

~/.conda/envs/env_a/lib/python3.8/site-packages/scipy/sparse/base.py in _process_toarray_args(self, order, out)
   1200             return out
   1201         else:
-> 1202             return np.zeros(self.shape, dtype=self.dtype, order=order)
   1203 
   1204 

MemoryError: Unable to allocate 7.28 TiB for an array with shape (1000000, 1000000) and data type int64

In the traceback, which is equivalent to the second half of the traceback above, there is a comment explaining that the shape of a sparse matrix is obtained by converting it to a dense matrix and returning the shape of that dense matrix, and that the ShapeFeature optimizer in tensor.opt is meant to later remove the dense_from_sparse call via shape propagation. So the problem is that in tt.grad, which is called by gradient1, the shape of the sparse matrix is accessed in a way that gives the ShapeFeature optimizer no opportunity to remove the dense_from_sparse call, leading to the creation of an N x N matrix of zeros just to read off its shape.
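
As an aside, and only as a sketch of the underlying point (not a fix for the code path inside tt.grad): the shape can be read from a sparse variable without a dense round-trip, e.g. via theano.sparse.basic.csm_properties, which extracts the (data, indices, indptr, shape) fields of a CSR/CSC variable, assuming that function behaves as documented:

import numpy as np
import scipy.sparse as sp
import theano
import theano.sparse

theano.config.compute_test_value = 'raise'

N = 1_000_000
sX = sp.random(N, N, density=1e-7, format='csr', data_rvs=np.ones, dtype=int)
tX = theano.sparse.basic.as_sparse(sX)

# tX.shape would go through dense_from_sparse (as in the traceback above);
# csm_properties only pulls out the CSR fields, so no dense array is allocated.
shape_var = theano.sparse.basic.csm_properties(tX)[3]
print(shape_var.tag.test_value)  # expected: [1000000 1000000]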

These are interesting findings, but also rather confusing, given (1) the discrepancy between running on Mac and Linux and (2) the fact that the code clearly intends to avoid this problem, yet the wrong pathway gets triggered. It seems like a really good candidate for further investigation and potentially a PR for Aesara, the actively developed successor of Theano.