By the way, it’s possible to avoid MH while simultaneously adding some intuition that the change-point will be somewhere “in the middle” of the trend, by using a scaled Beta (nb the hyperprior is enormously helpful for speeding up sampling):
with pm.Model() as switchpoint_model_noMH:
# Scaled beta mixture "somewhere in the middle" prior
a = pm.HalfNormal('a_hp', 3.)
b = pm.HalfNormal('b_hp', 3.)
sp_latent = pm.Beta('sp_latent', a, b)
switchpoint = pm.Deterministic('switchpoint', X.min() + (X.max() - X.min()) * sp_latent)
# Priors
beta1_offset = pm.Normal('beta1_offset')
beta1 = pm.Deterministic("beta1", beta1_offset * 3)
beta2_offset = pm.Normal('beta2_offset')
beta2 = pm.Deterministic("beta2", beta2_offset * 3)
intercept_offset = pm.Normal('intercept_offset', mu=0, sd=1)
intercept = pm.Deterministic("intercept", intercept_offset * 3)
std = pm.HalfNormal('std', sd=0.5)
# Likelihood
part1 = intercept + pm.math.dot(pm.math.sin(X), beta1)
part2 = intercept + pm.math.dot(pm.math.sin(X), beta1) + \
pm.math.dot(pm.math.sqr(X), beta2)
#model = pm.math.switch(tt.gt(switchpoint, X), part1, part2)
w = tt.nnet.sigmoid(2*(X-switchpoint))
model = (1-w)*part1 + w*part2
y_lik = pm.Normal('y_lik', mu=model, sd=std, observed = Y)
trace = pm.sample(1000, init='advi+adapt_diag', tune=tune + burn_in, chains=4, cores=3)
ppc = pm.sample_posterior_predictive(trace, 1000)