make categorical, add constant, log the data
>>> import pandas
>>> import statsmodels.api as sm
>>> import numpy as np
>>>
>>> url = "http://www.ats.ucla.edu/stat/mult_pkg/faq/general/lgtrans.csv"
>>> data = pandas.read_csv(url)
>>> print data.female.unique()
['male' 'female']
>>> data['female'] = data.female.replace(dict(male=0, female=1))
>>> data[['math', 'read']] = np.log(data[['math', 'read']])
>>> data['const'] = 1 # sm.add_constant(data)
>>> y_name = 'write'
>>> x_name = ['const', 'female', 'math', 'read']
>>> test_scores = sm.OLS(data[y_name], data[x_name]).fit()
what did I log? I cant remember...
>>> print test_scores.summary()
OLS Regression Results
==============================================================================
Dep. Variable: write R-squared: 0.530
Model: OLS Adj. R-squared: 0.523
Method: Least Squares F-statistic: 73.70
Date: Tue, 23 Oct 2012 Prob (F-statistic): 5.92e-32
Time: 12:23:46 Log-Likelihood: -657.58
No. Observations: 200 AIC: 1323.
Df Residuals: 196 BIC: 1336.
Df Model: 3
==============================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------
const -99.1640 10.804 -9.178 0.000 -120.471 -77.857
female 5.3888 0.931 5.789 0.000 3.553 7.224
math 20.9410 3.431 6.104 0.000 14.175 27.707
read 16.8522 3.063 5.501 0.000 10.811 22.894
==============================================================================
Omnibus: 2.259 Durbin-Watson: 1.999
Prob(Omnibus): 0.323 Jarque-Bera (JB): 2.286
Skew: -0.253 Prob(JB): 0.319
Kurtosis: 2.866 Cond. No. 135.
==============================================================================
What is the predicted test score of this hypothetical student?
>>> X = [1, 1, 65, 70]
>>> print test_scores.predict(X)
2447.04000815
Huh? Oh, right
>>> X = [1, 1, np.log(65), np.log(70)]
>>> print test_scores.predict(X)
65.2368994986
mention Nathaniel

point out the import, patsy can use anything in the calling namespace, uses the tokenize module so it can robustly handle any arbitray Python code
>>> from statsmodels.formula.api import ols
>>>
>>> url = "http://www.ats.ucla.edu/stat/mult_pkg/faq/general/lgtrans.csv"
>>> data = pandas.read_csv(url)
>>> model = "write ~ C(female) + np.log(math) + np.log(read)"
>>> test_scores = ols(model, data=data).fit()
this does what we expect and gives us useful information on the transformations
>>> print test_scores.summary()
OLS Regression Results
==============================================================================
Dep. Variable: write R-squared: 0.530
Model: OLS Adj. R-squared: 0.523
Method: Least Squares F-statistic: 73.70
Date: Tue, 23 Oct 2012 Prob (F-statistic): 5.92e-32
Time: 12:32:14 Log-Likelihood: -657.58
No. Observations: 200 AIC: 1323.
Df Residuals: 196 BIC: 1336.
Df Model: 3
=====================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
-------------------------------------------------------------------------------------
Intercept -93.7752 10.766 -8.710 0.000 -115.007 -72.543
C(female)[T.male] -5.3888 0.931 -5.789 0.000 -7.224 -3.553
np.log(math) 20.9410 3.431 6.104 0.000 14.175 27.707
np.log(read) 16.8522 3.063 5.501 0.000 10.811 22.894
==============================================================================
Omnibus: 2.259 Durbin-Watson: 1.999
Prob(Omnibus): 0.323 Jarque-Bera (JB): 2.286
Skew: -0.253 Prob(JB): 0.319
Kurtosis: 2.866 Cond. No. 135.
==============================================================================
>>> X = [['female', 65, 70]]
>>> X = pandas.DataFrame(X, columns=["female", "math", "read"])
>>> print test_scores.predict(X)
[ 65.2368995]
>>>
>>> X = dict(female=['female'], read=[70], math=[65])
>>> print test_scores.predict(X)
[ 65.2368995]

| Operator | Meaning |
|---|---|
| ~ | Separate the left-hand side from the right-hand side. If omitted, formula is assumed right-hand side only. |
| + | Combines terms on either side (set union). |
| - | Removes terms on the right from set of terms on the left (set difference). |
| * | a*b is shorthand for the expansion a + b + a:b. |
| / | a/b is shorthand for the expansion a + a:b. It is used when b is nested within a (e.g., states and counties) |
| : | Computes the interaction between terms on the left and right. |
| ** | Takes a set of terms on the left and an integer |
>>> from patsy import EvalEnvironment, ModelDesc
>>> env = EvalEnvironment.capture()
>>>
>>> formula = "y ~ (a + b):(c + d)"
>>> desc = ModelDesc.from_formula(formula, env)
>>>
>>> desc.describe()
'y ~ a:c + a:d + b:c + b:d'
>>> desc.rhs_termlist
(Term([]),
Term([EvalFactor('a'), EvalFactor('c')]),
Term([EvalFactor('a'), EvalFactor('d')]),
Term([EvalFactor('b'), EvalFactor('c')]),
Term([EvalFactor('b'), EvalFactor('d')]))
>>>
>>> term = desc.rhs_termlist[1]
>>> term.factors
(EvalFactor('a'), EvalFactor('c'))
>>> formula = "(x1*x2)**2" # patsy factors
>>> desc = ModelDesc.from_formula(formula, env)
>>> desc.describe()
'~ x1 + x2 + x1:x2'
>>>
>>> formula = "I((x1+x2)**2)" # Python factors
'~ I((x1 + x2) ** 2)'
>>> from pandas import DataFrame
>>> from patsy import dmatrix
>>>
>>> dta = DataFrame([["lo", 1],["hi", 2.4],["lo", 1.2],["lo", 1.4]],
columns=["carbs", "score"])
>>> dmatrix("carbs + score", dta)
DesignMatrix with shape (4, 3)
Intercept carbs[T.lo] score
1 1 1.0
1 0 2.4
1 1 1.2
1 1 1.4
Terms:
'Intercept' (column 0)
'carbs' (column 1)
'score' (column 2)
>>> dmatrix("C(carbs, Treatment(reference='lo')) + score", dta)
DesignMatrix with shape (4, 3)
Intercept C(carbs, Treatment(reference='lo'))[T.hi] score
1 0 1.0
1 1 2.4
1 0 1.2
1 0 1.4
Terms:
'Intercept' (column 0)
"C(carbs, Treatment(reference='lo'))" (column 1)
'score' (column 2)
if you have never heard of a contrast, then you are likely ok with the default. if you are a statistician, then why aren`t you contributing code to statsmodels?
>>> from patsy import dmatrix, build_design_matrices
>>>
>>> def naive_center(x):
... x = np.asarray(x)
... return x - np.mean(x)
...
>>> data = {"x" : [1, 2, 3, 4]}
>>> mat = dmatrix("naive_center(x)", data)
>>> mat
DesignMatrix with shape (4, 2)
Intercept naive_center(x)
1 -1.5
1 -0.5
1 0.5
1 1.5
Terms:
'Intercept' (column 0)
'naive_center(x)' (column 1)
>>> new_data = {"x" : [5, 6, 7, 8]}
>>> build_design_matrices([mat.design_info.builder], new_data)[0]
DesignMatrix with shape (4, 2)
Intercept naive_center(x)
1 -1.5
1 -0.5
1 0.5
1 1.5
Terms:
'Intercept' (column 0)
'naive_center(x)' (column 1)
>>> fixed_mat = matrix("center(x)", data)
>>> fixed_mat
DesignMatrix with shape (4, 2)
Intercept center(x)
1 -1.5
1 -0.5
1 0.5
1 1.5
Terms:
'Intercept' (column 0)
'center(x)' (column 1)
>>> build_design_matrices([fixed_mat.design_info.builder], new_data)[0]
DesignMatrix with shape (4, 2)
Intercept center(x)
1 2.5
1 3.5
1 4.5
1 5.5
Terms:
'Intercept' (column 0)
'center(x)' (column 1)
class MyTransform(object):
def __init__(self):
self._total = 0
self._count = 0
self._mean = None
def memorize_chunk(self, x):
# Updates internal state, this is called one or more times
self._total += np.sum(x)
self._count += len(x)
def memorize_finish(self):
# this is called once after memorize chunk as called as many times
# as needed, for example, after reading in chunks of data incrementally
self._mean = self.total * 1. / self._count
def transform(self, x):
# this is called one or more times on old and new data
# args are the same as memorize_chunk
return x - self._mean
>>> from numpy import log
>>> url = "http://www.ats.ucla.edu/stat//mult_pkg/faq/general/lgtrans.csv"
>>> data = pandas.read_csv(url)
>>> model = "write ~ female + log(math) + log(read)"
>>> test_scores = ols(model, data=data).fit()
>>>
>>> print test_scores.f_test("log(read) = log(math)")
<F test: F=array([[ 0.47954952]]), p=[[ 0.48944607]], df_denom=196, df_num=1>
>>>
>>> print test_scores.f_test("log(read) = log(math) = 30")
<F test: F=array([[ 34.94492186]]), p=[[ 1.04790910e-13]], df_denom=196, df_num=2>
>>> from patsy import DesignInfo
>>>
>>> info = DesignInfo(test_scores.model.exog_names)
>>> print info.describe()
'1 + C(female)[T.male] + log(math) + log(read)'
>>>
>>> LC = info.linear_constraint("log(read) = 20, log(math) = 20")
>>> LC.coefs # R
array([[ 0., 0., 0., 1.],
[ 0., 0., 1., 0.]])
>>>
>>> LC.constants
array([[20.],
[20.]])
>>>
>>> print np.dot(LC.coefs, test_scores.params)
[ 16.85217528 20.94096826]
| Table of Contents | t |
|---|---|
| Exposé | ESC |
| Full screen slides | e |
| Presenter View | p |
| Source Files | s |
| Slide Numbers | n |
| Toggle screen blanking | b |
| Show/hide slide context | c |
| Notes | 2 |
| Help | h |