Chain failed due to mass matrix going to zero in matrix normal distribution

I am trying to run multivariate linear regression using matrix normal distribution. I am facing the problem of a mass matrix contains zero on the diagonal. In addition, there are other warnings when I run the code, I am not sure is it because of the code or theano version. Code takes a lot of time given I am running on very small data. Below is the code-

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.cm as cmap
cm = cmap.inferno

import numpy as np
import scipy as sp
import theano
import theano.tensor as tt
import theano.tensor.nlinalg
import sys
sys.path.insert(0, "../../..")
import pymc3 as pm

from sklearn import preprocessing
# simulated data
np.random.seed(20090425)
n = 30
p = 2
p_dash = 2
q= 3
y = np.vstack((np.random.normal(0.1, 1, (20,p)),np.random.normal(0.5, 2, (10,p))))
X = np.vstack((np.random.normal(1, 1, (20,p_dash)),np.random.normal(2, 2, (10,p_dash))))
#X = np.hstack((X,np.ones(n)))

B = np.random.normal(0.2,2,size=(p_dash,p))
y = np.matmul(X,B) 
y = preprocessing.scale(y)

from pymc3 import (
    NUTS,
    Deterministic,
    HalfCauchy,
    Model,
    MvNormal,
    find_MAP,
    sample,
    summary,
    traceplot,
)
import theano.tensor as T

true_scale = 3
from theano.tensor import _shared

with pm.Model() as model:

   
    mu = np.zeros(n)

    tau_sq = pm.TruncatedNormal('tau_sq', mu=0, sigma=30, lower=0)
    
    # beta distribution
    # Setup right cholesky matrix
    sd_dist_beta = pm.HalfCauchy.dist(beta=10, shape=p)
    colchol_packed_beta = pm.LKJCholeskyCov('colcholpacked_beta', n=p, eta=2, sd_dist=sd_dist_beta)
    colchol_beta = pm.expand_packed_triangular(p, colchol_packed_beta)

    # Setup left covariance matrix
    rowchol_packed_beta = pm.LKJCholeskyCov('rowcholpacked_beta', n=p_dash, eta=2, sd_dist=sd_dist_beta)
    rowchol_beta = pm.expand_packed_triangular(p_dash, colchol_packed_beta)    
        
    mu_beta = pm.MvNormal('mu_beta',mu=np.zeros(p),cov= np.eye(p)*tau_sq, shape=(p_dash,p))
    beta = pm.MatrixNormal("beta", mu=mu_beta, colchol=colchol_beta, rowchol=rowchol_beta, shape=(p_dash,p))
    

    sd_dist_y = pm.HalfCauchy.dist(beta=7, shape=p)
    colchol_packed = pm.LKJCholeskyCov('colcholpacked', n=p, eta=2,sd_dist=sd_dist_y)
    colchol = pm.expand_packed_triangular(p, colchol_packed)
    radon_est = pm.math.dot(X,beta) 
    y_obs = pm.MatrixNormal("y_obs", mu=radon_est, colchol=colchol, rowcov=np.eye(n),observed=y,shape=(n,p))
    
with model:
    trace = pm.sample(1000, tune=2000)

Below is the error-

/home/dushyant/anaconda3/lib/python3.7/site-packages/pymc3/sampling.py:468: FutureWarning: In an upcoming release, pm.sample will return an `arviz.InferenceData` object instead of a `MultiTrace` by default. You can pass return_inferencedata=True or return_inferencedata=False to be safe and silence this warning.
  FutureWarning,
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
ERROR (theano.gof.opt): SeqOptimizer apply <theano.tensor.opt.FusionOptimizer object at 0x7f318b51a7d0>
ERROR (theano.gof.opt): Traceback:
ERROR (theano.gof.opt): Traceback (most recent call last):
  File "/home/dushyant/anaconda3/lib/python3.7/site-packages/theano/scalar/basic.py", line 358, in filter
    " (%s) and allow_downcast is not True" % self.dtype
TypeError: Value cannot accurately be converted to dtype (float64) and allow_downcast is not True

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/dushyant/anaconda3/lib/python3.7/site-packages/theano/gof/opt.py", line 249, in apply
    sub_prof = optimizer.optimize(fgraph)
  File "/home/dushyant/anaconda3/lib/python3.7/site-packages/theano/gof/opt.py", line 89, in optimize
    ret = self.apply(fgraph, *args, **kwargs)
  File "/home/dushyant/anaconda3/lib/python3.7/site-packages/theano/tensor/opt.py", line 7774, in apply
    new_outputs = self.optimizer(node)
  File "/home/dushyant/anaconda3/lib/python3.7/site-packages/theano/tensor/opt.py", line 7596, in local_fuse
    tmp.tag.test_value = tv.flatten()[0]
  File "/home/dushyant/anaconda3/lib/python3.7/site-packages/theano/gof/utils.py", line 277, in __setattr__
    obj = self.attr_filter(obj)
  File "/home/dushyant/anaconda3/lib/python3.7/site-packages/theano/scalar/basic.py", line 364, in filter
    e,
TypeError: ("Could not convert <class 'numpy.float64'> (value=nan) to float64", TypeError('Value cannot accurately be converted to dtype (float64) and allow_downcast is not True'))

Multiprocess sampling (4 chains in 4 jobs)
NUTS: [colcholpacked, beta, mu_beta, rowcholpacked_beta, colcholpacked_beta, tau_sq]

 24.47% [2936/12000 09:57<30:45 Sampling 4 chains, 0 divergences]
---------------------------------------------------------------------------
RemoteTraceback                           Traceback (most recent call last)
RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/home/dushyant/anaconda3/lib/python3.7/site-packages/pymc3/parallel_sampling.py", line 138, in run
    self._start_loop()
  File "/home/dushyant/anaconda3/lib/python3.7/site-packages/pymc3/parallel_sampling.py", line 192, in _start_loop
    point, stats = self._compute_point()
  File "/home/dushyant/anaconda3/lib/python3.7/site-packages/pymc3/parallel_sampling.py", line 217, in _compute_point
    point, stats = self._step_method.step(self._point)
  File "/home/dushyant/anaconda3/lib/python3.7/site-packages/pymc3/step_methods/arraystep.py", line 273, in step
    apoint, stats = self.astep(array)
  File "/home/dushyant/anaconda3/lib/python3.7/site-packages/pymc3/step_methods/hmc/base_hmc.py", line 148, in astep
    self.potential.raise_ok(self._logp_dlogp_func._ordering.vmap)
  File "/home/dushyant/anaconda3/lib/python3.7/site-packages/pymc3/step_methods/hmc/quadpotential.py", line 272, in raise_ok
    raise ValueError("\n".join(errmsg))
ValueError: Mass matrix contains zeros on the diagonal. 
The derivative of RV `beta`.ravel()[3] is zero.
"""

The above exception was the direct cause of the following exception:

ValueError                                Traceback (most recent call last)
~/anaconda3/lib/python3.7/site-packages/pymc3/parallel_sampling.py in run()
    137             self._point = self._make_numpy_refs()
--> 138             self._start_loop()
    139         except KeyboardInterrupt:

~/anaconda3/lib/python3.7/site-packages/pymc3/parallel_sampling.py in _start_loop()
    191                 try:
--> 192                     point, stats = self._compute_point()
    193                 except SamplingError as e:

~/anaconda3/lib/python3.7/site-packages/pymc3/parallel_sampling.py in _compute_point()
    216         if self._step_method.generates_stats:
--> 217             point, stats = self._step_method.step(self._point)
    218         else:

~/anaconda3/lib/python3.7/site-packages/pymc3/step_methods/arraystep.py in step()
    272         if self.generates_stats:
--> 273             apoint, stats = self.astep(array)
    274             point = self._logp_dlogp_func.array_to_full_dict(apoint)

~/anaconda3/lib/python3.7/site-packages/pymc3/step_methods/hmc/base_hmc.py in astep()
    147             ]
--> 148             self.potential.raise_ok(self._logp_dlogp_func._ordering.vmap)
    149             message_energy = (

~/anaconda3/lib/python3.7/site-packages/pymc3/step_methods/hmc/quadpotential.py in raise_ok()
    271                 )
--> 272             raise ValueError("\n".join(errmsg))
    273 

ValueError: Mass matrix contains zeros on the diagonal. 
The derivative of RV `beta`.ravel()[3] is zero.

The above exception was the direct cause of the following exception:

RuntimeError                              Traceback (most recent call last)
<ipython-input-4-0e31cfaf8a73> in <module>
     83 
     84 with model:
---> 85     trace = pm.sample(1000, tune=2000)

~/anaconda3/lib/python3.7/site-packages/pymc3/sampling.py in sample(draws, step, init, n_init, start, trace, chain_idx, chains, cores, tune, progressbar, model, random_seed, discard_tuned_samples, compute_convergence_checks, callback, jitter_max_retries, return_inferencedata, idata_kwargs, mp_ctx, pickle_backend, **kwargs)
    555         _print_step_hierarchy(step)
    556         try:
--> 557             trace = _mp_sample(**sample_args, **parallel_args)
    558         except pickle.PickleError:
    559             _log.warning("Could not pickle model, sampling singlethreaded.")

~/anaconda3/lib/python3.7/site-packages/pymc3/sampling.py in _mp_sample(draws, tune, step, chains, cores, chain, random_seed, start, progressbar, trace, model, callback, discard_tuned_samples, mp_ctx, pickle_backend, **kwargs)
   1474         try:
   1475             with sampler:
-> 1476                 for draw in sampler:
   1477                     trace = traces[draw.chain - chain]
   1478                     if trace.supports_sampler_stats and draw.stats is not None:

~/anaconda3/lib/python3.7/site-packages/pymc3/parallel_sampling.py in __iter__(self)
    478 
    479         while self._active:
--> 480             draw = ProcessAdapter.recv_draw(self._active)
    481             proc, is_last, draw, tuning, stats, warns = draw
    482             self._total_draws += 1

~/anaconda3/lib/python3.7/site-packages/pymc3/parallel_sampling.py in recv_draw(processes, timeout)
    358             else:
    359                 error = RuntimeError("Chain %s failed." % proc.chain)
--> 360             raise error from old_error
    361         elif msg[0] == "writing_done":
    362             proc._readable = True

I think I have correctly given prior and everything, but I am not sure what is wrong here. Also, is matrix normal a good prior on beta when performing Bayesian multivariate regression? I feel like I have overcomplicated things.

Which version of PyMC3 / Theano are you using? Could you use watermark to provide the version numbers?

I am using pyMC3 version 3.10.0, Theano version 1.0.5 and Theano-PyMC version 1.0.11