Hello,
I’ve just finished the rethinking book and wanted to try out modeling a simple regression then package the model to call from an API. I decided to use a current kaggle data set found here:
kaggle competitions download -c elo-merchant-category-recommendation
I’m trying this one coefficient at a time and can’t get past the first one. The code is not efficient but i’m just trying to get to the modeling part.
x = …train.csv
dummy_feature1 = pd.get_dummies(x['feature_1'], drop_first = True, prefix = 'feature_1')
x = x.drop(['feature_1'], axis = 1)
x = pd.concat([x, dummy_feature1], axis = 1)
dummy_feature2 = pd.get_dummies(x['feature_2'], drop_first = True, prefix = 'feature_2')
dummy_feature3 = pd.get_dummies(x['feature_3'], drop_first = True, prefix = 'feature_ 3')
x = pd.concat([x, dummy_feature2], axis = 1)
x = pd.concat([x, dummy_feature3], axis = 1)
x = x.drop(['feature_2', 'feature_3'], axis = 1)
import datetime
x['first_active_month'] = pd.to_datetime(x['first_active_month'])
year'] = x['first_active_month'].dt.year
x['month'] = x['first_active_month'].dt.month
dummy_year = pd.get_dummies(x['year'], drop_first=True, prefix='year')
dummy_month = pd.get_dummies(x['month'], drop_first=True, prefix = 'month')
x = pd.concat([x, dummy_year, dummy_month], axis = 1)
x.iloc[:,2:10] = x.iloc[:,2:10].astype(int)
x.iloc[:,17: ] = x.iloc[:,17: ].astype(int)
x = pd.concat([x.iloc[:,2:10], x.iloc[:,17:]], axis = 1)
model_input = tt.shared(np.array(x['feature_1_2']))
model_output = tt.shared(np.array(x['target']))
with pm.Model() as linear_model:
alpha = pm.Normal('intercept', mu = 0, sd = 10)
beta_1 = pm.Normal('beta_feature1_2', mu = 0, sd = 1)
#beta_2 = pm.Normal('beta_feature1_3', mu = 0, sd = 1, shape = (len(x), 1))
#beta_3 = pm.Normal('beta_feature1_4', mu = 0, sd = 1, shape = (len(x), 1))
#beta_4 = pm.Normal('beta_feature1_5', mu = 0, sd = 1, shape = (len(x), 1))
#beta_5 = pm.Normal('beta_feature2_2', mu = 0, sd = 1, shape = (len(x), 1))
#beta_6 = pm.Normal('beta_feature2_3', mu = 0, sd = 1, shape = (len(x), 1))
#beta_7 = pm.Normal('beta_feature3_1', mu = 0, sd = 1, shape = (len(x), 1))
#beta_8 = pm.Normal('beta_month2', mu = 0, sd = 1, shape = (len(x), 1))
#beta_9 = pm.Normal('beta_month3', mu = 0, sd = 1, shape = (len(x), 1))
#beta_10 = pm.Normal('beta_month4', mu = 0, sd = 1, shape = (len(x), 1))
#beta_11 = pm.Normal('beta_month5', mu = 0, sd = 1, shape = (len(x), 1))
#beta_12 = pm.Normal('beta_month6', mu = 0, sd = 1, shape = (len(x), 1))
#beta_13 = pm.Normal('beta_month7', mu = 0, sd = 1, shape = (len(x), 1))
#beta_14 = pm.Normal('beta_month8', mu = 0, sd = 1, shape = (len(x), 1))
#beta_15 = pm.Normal('beta_month9', mu = 0, sd = 1, shape = (len(x), 1))
#beta_16 = pm.Normal('beta_month10', mu = 0, sd = 1, shape = (len(x), 1))
#beta_17 = pm.Normal('beta_month11', mu = 0, sd = 1, shape = (len(x), 1))
#beta_18 = pm.Normal('beta_month12', mu = 0, sd = 1, shape = (len(x), 1))
sigma = pm.Uniform('sigma', lower = 0, upper = 10)
mu = pm.Normal('mu', alpha + beta_1*x['feature_1_2'] ) #+ beta_2*x['feature_1_3'] + beta_3*x['feature_1_4'] + beta_4*x['feature_1_5'] + beta_5*x['feature_2_2'] + beta_6*x['feature_2_3'] + beta_7*x['feature_ 3_1'] + beta_8*x['month_2'] + beta_9*x['month_3'] + beta_10*x['month_4'] + beta_11*x['month_5'] + beta_12*x['month_6'] + beta_13*x['month_7'] + beta_14*x['month_8'] + beta_15*x['month_9']+ beta_16*x['month_10'] + beta_17*x['month_11'] + beta_18*x['month_12'] )
Target = pm.Normal('target', mu = mu, sd = sigma, observed = model_output)
trace_linear = pm.sample(1000, tune = 1000, step= NUTS())
Then I get the following error:
TypeError Traceback (most recent call last)
<ipython-input-228-87a83e72a462> in <module>()
21 sigma = pm.Uniform('sigma', lower = 0, upper = 10)
22
---> 23 mu = pm.Normal('mu', alpha + beta_1*model_input ) #+ beta_2*x['feature_1_3'] + beta_3*x['feature_1_4'] + beta_4*x['feature_1_5'] + beta_5*x['feature_2_2'] + beta_6*x['feature_2_3'] + beta_7*x['feature_ 3_1'] + beta_8*x['month_2'] + beta_9*x['month_3'] + beta_10*x['month_4'] + beta_11*x['month_5'] + beta_12*x['month_6'] + beta_13*x['month_7'] + beta_14*x['month_8'] + beta_15*x['month_9']+ beta_16*x['month_10'] + beta_17*x['month_11'] + beta_18*x['month_12'] )
24
25 Target = pm.Normal('target', mu = mu, sd = sigma, observed = model_output)
~\AppData\Local\conda\conda\envs\theano\lib\site-packages\pymc3\distributions\distribution.py in __new__(cls, name, *args, **kwargs)
40 total_size = kwargs.pop('total_size', None)
41 dist = cls.dist(*args, **kwargs)
---> 42 return model.Var(name, dist, data, total_size)
43 else:
44 raise TypeError("Name needs to be a string but got: {}".format(name))
~\AppData\Local\conda\conda\envs\theano\lib\site-packages\pymc3\model.py in Var(self, name, dist, data, total_size)
806 with self:
807 var = FreeRV(name=name, distribution=dist,
--> 808 total_size=total_size, model=self)
809 self.free_RVs.append(var)
810 else:
~\AppData\Local\conda\conda\envs\theano\lib\site-packages\pymc3\model.py in __init__(self, type, owner, index, name, distribution, total_size, model)
1206 self.tag.test_value = np.ones(
1207 distribution.shape, distribution.dtype) * distribution.default()
-> 1208 self.logp_elemwiset = distribution.logp(self)
1209 # The logp might need scaling in minibatches.
1210 # This is done in `Factor`.
~\AppData\Local\conda\conda\envs\theano\lib\site-packages\pymc3\distributions\continuous.py in logp(self, value)
445 mu = self.mu
446
--> 447 return bound((-tau * (value - mu)**2 + tt.log(tau / np.pi / 2.)) / 2.,
448 sd > 0)
449
~\AppData\Local\conda\conda\envs\theano\lib\site-packages\theano\tensor\var.py in __sub__(self, other)
145 # and the return value in that case
146 try:
--> 147 return theano.tensor.basic.sub(self, other)
148 except (NotImplementedError, AsTensorError):
149 return NotImplemented
~\AppData\Local\conda\conda\envs\theano\lib\site-packages\theano\gof\op.py in __call__(self, *inputs, **kwargs)
613 """
614 return_list = kwargs.pop('return_list', False)
--> 615 node = self.make_node(*inputs, **kwargs)
616
617 if config.compute_test_value != 'off':
~\AppData\Local\conda\conda\envs\theano\lib\site-packages\theano\tensor\elemwise.py in make_node(self, *inputs)
480 inputs = list(map(as_tensor_variable, inputs))
481 out_dtypes, out_broadcastables, inputs = self.get_output_info(
--> 482 DimShuffle, *inputs)
483 outputs = [TensorType(dtype=dtype, broadcastable=broadcastable)()
484 for dtype, broadcastable in izip(out_dtypes,
~\AppData\Local\conda\conda\envs\theano\lib\site-packages\theano\tensor\elemwise.py in get_output_info(self, dim_shuffle, *inputs)
436 args.append(dim_shuffle(
437 input.type.broadcastable,
--> 438 ['x'] * difference + list(range(length)))(input))
439 inputs = args
440
~\AppData\Local\conda\conda\envs\theano\lib\site-packages\theano\gof\op.py in __call__(self, *inputs, **kwargs)
623 for i, ins in enumerate(node.inputs):
624 try:
--> 625 storage_map[ins] = [self._get_test_value(ins)]
626 compute_map[ins] = [True]
627 except AttributeError:
~\AppData\Local\conda\conda\envs\theano\lib\site-packages\theano\gof\op.py in _get_test_value(cls, v)
560 # ensure that the test value is correct
561 try:
--> 562 ret = v.type.filter(v.tag.test_value)
563 except Exception as e:
564 # Better error message.
~\AppData\Local\conda\conda\envs\theano\lib\site-packages\theano\tensor\type.py in filter(self, data, strict, allow_downcast)
176 raise TypeError("Wrong number of dimensions: expected %s,"
177 " got %s with shape %s." % (self.ndim, data.ndim,
--> 178 data.shape))
179 if not data.flags.aligned:
180 try:
TypeError: For compute_test_value, one input test value does not have the requested type.
The error when converting the test value to that variable type:
Wrong number of dimensions: expected 0, got 1 with shape (201917,).`
I’m not sure what I’m doing wrong.