Rotation Matrix Convergence (no divergences)?

I am trying to understand why my inference keeps returning chains that do not converge even when I have no divergences.

I have a random (3 X 3) rotation matrix that I use to rotate a (M X 3) array of data. I then add random Gaussian noise to each column, same variance for each column. I had trouble doing the sampling on the 9 elements of the (3 X 3) rotation matrix so I transformed them into their angles. (Updated with new image and correct Theano implementation for yaw)



Code Below


import numpy as np
import pymc3 as pm
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import arviz as az
import scipy.linalg as la
import theano.tensor as tt
import theano.tensor.nlinalg as tt_la
from math import (asin, pi, atan2, cos, sin)

 # Function Needed
def yaw_pitch_roll_inverse_calc(Q):
    if Q[2,0] != 1 or Q[2,0] != -1: 
        pitch_1 = -1*asin(Q[2,0])
        pitch_2 = pi - pitch_1 
        roll_1 = atan2( Q[2,1] / cos(pitch_1) , Q[2,2] /cos(pitch_1) ) 
        roll_2 = atan2( Q[2,1] / cos(pitch_2) , Q[2,2] /cos(pitch_2) ) 
        yaw_1 = atan2( Q[1,0] / cos(pitch_1) , Q[0,0] / cos(pitch_1) )
        yaw_2 = atan2( Q[1,0] / cos(pitch_2) , Q[0,0] / cos(pitch_2) ) 

        pitch = pitch_1 
        roll = roll_1
        yaw = yaw_1 
    else: 
        yaw = 0 # anything (we default this to zero)
        if Q[2, 0] == -1: 
            pitch = pi/2 
            roll = yaw + atan2(Q[0, 1], Q[0, 2]) 
        else: 
            pitch = -pi/2 
            roll = -1*yaw + atan2(-1*Q[0, 1],-1*Q[0, 2]) 

    # convert from radians to degrees
    roll = roll*180/pi 
    pitch = pitch*180/pi
    yaw = yaw*180/pi 

    rxyz_deg = [yaw, pitch, roll] 
    return rxyz_deg

np.random.seed(9002)

rotation = np.random.randint(low=1, high=100, size=(3,3))  # Create    Random 3X3 Matrix
Q = np.transpose(la.orth(rotation))  # Orthogonalize it

M = 10_000  # Number of Data Points
D = 3  # Number of Dimensions
shape = np.random.uniform(low=0, high=20, size=(M, D))  # create an a time series object with 3-spatial coords
noise = np.random.normal(loc=0.0, scale=3.25, size=(M, D))  # Create Noise

#  Only Rotation
target = (shape @ Q) + noise  # Make the target shape 

if __name__ == '__main__':
    with pm.Model() as model_Q:
         # Priors for unknown model parameter
        sigma_a = pm.Lognormal('sigma_a', mu=0, sigma=1)
        sigma_b = pm.Lognormal('sigma_b', mu=0, sigma=1)
        sigma_g = pm.Lognormal('sigma_g', mu=0, sigma=1)
    
        alpha = pm.Normal("alpha", mu=a, sigma=sigma_a)
        beta = pm.Normal("beta", mu=b, sigma=sigma_b)
        gamma = pm.Normal("gamma", mu=g, sigma=sigma_g)

        # Math on Priors
        yaw = tt.stack([[tt.cos(gamma), -tt.sin(gamma), 0,
                          tt.sin(gamma), tt.cos(gamma), 0,
                          0, 0, 1]).reshape((3, 3))
    
        pitch = tt.stack([tt.cos(beta), 0, tt.sin(beta),
                          0, 1, 0,
                          -tt.sin(beta), 0,  tt.cos(beta)]).reshape((3, 3))
    
        roll = tt.stack([1, 0, 0,
                          0, tt.cos(alpha), -tt.sin(alpha),
                          0, tt.sin(alpha),  tt.cos(alpha)]).reshape((3, 3))
    
        T = tt.dot(tt.dot(yaw, pitch), roll)
    
    
        sigma = pm.HalfNormal("sigma", sigma=3.5)

        # Expected value of outcome
       mu = tt.dot(shape, T)

        # Likelihood (sampling distribution) of observations
        Y_obs = pm.Normal("Y_obs", mu=mu, sigma=sigma, observed=target)

    with model_Q:
        # draw 500 posterior samples
        trace_Q = pm.sample(4000, tune=4000, cores=4, target_accept=0.95)

I think it is quite possible to have non convergence without divergences (despite the names, they are quite different concepts). It seems like your model is strongly multimodal and that the chains for alpha and beta simply don’t mix. This can happen when they get stuck at different initialization points / modes for the entire run.