My aim with a small project is to see whether it’s possible to estimate covariances as part of mixture model parameters. I can set up a simulation for the data without problem:
CELL_TYPES = ['Neuron', 'Astroctye', 'Microglia', 'Oligodendrocyte', 'Endothelial']
CELL_TYPE_PROPORTIONS = [0.3, 0.25, 0.12, 0.18, 0.05]
TRUE_MEAN_MAT = np.array([[3., 1.5, 1., 0.3, 0.],
[1.5, 2., 0.5, 0.1, 0.]])
COV_MATS = [[[2.0, 1.7],
[1.7, 2.0]],
[[2.5, -1.8],
[-1.8, 2.0]],
[[0.4, 0.],
[0., 0.4]],
[[0.3, 0.12],
[0.12, 0.5]],
[[1.0, -0.3],
[-0.3, 0.7]]]
COV_MATS = [np.array(c) for c in COV_MATS]
MAT_I = np.array([[1., 0.], [0., 1.]])
N_OBS = 100
MAT_I_N = np.diag(np.ones(N_OBS,dtype=float))
with pm.Model() as model_true:
# define the true parameters
# proportion is going to be a dirichlet -- this is needed to actually perform sampling
a0 = 50. # low variance on the type proportions
cell_mix = pm.Dirichlet('cell_proportions', a0 * np.array(CELL_TYPE_PROPORTIONS))
# define the cell expression of the 2 genes
cdists = list()
for i, ctype in enumerate(CELL_TYPES):
cell_expr = pm.MvNormal(ctype, mu=TRUE_MEAN_MAT[:,i], cov=COV_MATS[i], shape=(2,))
cdists.append(pm.Deterministic('{}_exp'.format(ctype), tt.exp(cell_expr)))
expr = pm.Deterministic('expr', tt.log(cell_mix[0] * cdists[0] + \
cell_mix[1] * cdists[1] + \
cell_mix[2] * cdists[2] + \
cell_mix[3] * cdists[3] + \
cell_mix[4] * cdists[4]))
samples = pm.sample(draws=N_OBS, n_init=50000, chains=1)
samples['expr'].shape # (100, 2)
But (taking the cell_mix proportions as pre-specified) I cannot perform inference:
mod_input = samples['cell_proportions']
mod_output = samples['expr']
with pm.Model() as model_infer:
# define the prior parameters. Here we *observe* the mixture proportions and the cell expression
cpriors = list()
for i, ctype in enumerate(CELL_TYPES):
prior_mu = pm.MvNormal('{}_mu'.format(ctype), mu=np.array([0., 0.]), cov=MAT_I, shape=(2,))
prior_cov_pl = pm.LKJCholeskyCov('{}_cv_pl'.format(ctype), eta=1., sd_dist=pm.HalfCauchy.dist(1.), n=2)
prior_cov_L = pm.expand_packed_triangular(2, prior_cov_pl)
prior_cov = pm.Deterministic('{}_cv'.format(ctype), prior_cov_L.dot(prior_cov_L.T))
cell_expr = pm.MvNormal(ctype, mu=prior_mu, cov=prior_cov, shape=(2,))
cpriors.append(cell_expr)
# define the sampling for each sample
# In PyMC3 it is not possible to *observe* a sum (i.e. you can't pass data= to a deterministic)
# but i think we can cheat here
obs_expr = list()
for i in xrange(samples['expr'].shape[0]):
sam_expr = pm.Deterministic('expr{}'.format(i),
tt.log(mod_input[i, 0] * tt.exp(cpriors[0]) + \
mod_input[i, 1] * tt.exp(cpriors[1]) + \
mod_input[i, 2] * tt.exp(cpriors[2]) + \
mod_input[i, 3] * tt.exp(cpriors[3]) + \
mod_input[i, 4] * tt.exp(cpriors[4])))
obs_expr.append(sam_expr)
obs_expr = np.array(obs_expr) # 100 x 2
expr_obs = pm.MatrixNormal('expr_obs', mu=expr,
rowcov=0.01 * MAT_I_N, colcov=MAT_I, shape=(N_OBS,2),
observed=mod_output)
mod_fit = pm.sample()
This results in an error:
MissingInputError: Input 0 of the graph (indices start from 0), used to compute Elemwise{exp,no_inplace}(Endothelial), was not provided and not given a value. Use the Theano flag exception_verbosity=‘high’, for more information on this error.
But I’ve provided priors for the group (cell) mean and covariance. What’s wrong?