# read the data
from read_data_fx import read_tb_data
data = read_tb_data(1, 19)
# ACT test score (X)
X = data[:, 1]
# GPA at the end of the freshman year (Y)
Y = data[:, 0]
# import packages and functions
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
def lin_reg(X, Y):
    # least-squares estimates of the intercept (b0) and slope (b1)
    barX = np.mean(X); barY = np.mean(Y)
    XminusbarX = X - barX; YminusbarY = Y - barY
    b1 = np.sum(XminusbarX*YminusbarY)/np.sum(XminusbarX**2)
    b0 = barY - b1*barX
    # fitted values and residuals
    Yhat = b0 + b1*X
    e_i = Y - Yhat
    sse = np.sum(e_i**2)            # error sum of squares
    ssr = np.sum((Yhat - barY)**2)  # regression sum of squares
    return {'Yhat': Yhat, 'b0': b0, 'b1': b1, 'e_i': e_i, 'sse': sse, 'ssr': ssr}
# The ANOVA function; it uses the lin_reg function defined above
def ANOVA(X, Y):
    linreg = lin_reg(X, Y)
    n = len(X)
    sse = linreg['sse']
    ssr = linreg['ssr']
    df_sse = n - 2.0; mse = sse/df_sse   # error mean square
    df_ssr = 1.0; msr = ssr/df_ssr       # regression mean square
    barY = np.mean(Y)
    ssto = np.sum((Y - barY)**2)         # total sum of squares
    df_ssto = n - 1.0
    eq1 = r"\sigma^2+{\beta_1}^2\sum(X_i-\overline{X})^2"
    eq2 = r"\sigma^2"
    print("| Source     | {0: >3s} | {1: >3s} | {2: >3s} | {3: >3s} ".format('SS', 'df', 'MS', 'E{MS}'))
    print("-----------------------------------------------------------------------")
    print("| Regression | {0: >5.2f} | {1: >3d} | {2: >5.2f} | {3: >3s} ".format(ssr, int(df_ssr), msr, eq1))
    print("| Error      | {0: >5.2f} | {1: >3d} | {2: >5.2f} | {3: >3s} ".format(sse, int(df_sse), mse, eq2))
    print("| Total      | {0: >5.2f} | {1: >3d} | {2: >3s} |".format(ssto, int(df_ssto), ' '))
ANOVA(X,Y)
$MSR$ stands for regression mean square, and it gives the part of the variance of the response variable $Y$ that is associated with the regression. In this example $MSR$ represents the part of the variance of GPA that can be explained by including ACT scores in the regression model.
$MSE$ stands for error mean square, and it gives the variance of the response variable $Y$ around the fitted regression line. In this example it gives the variance in GPA around the fitted regression line, with ACT score treated as the predictor variable.
We know that $E\{MSE\}=\sigma^2$ and $E\{MSR\}= \sigma^2+{\beta_1}^2\sum(X_i-\overline{X})^2$; therefore, when $\beta_1=0$, the expectations of $MSE$ and $MSR$ are both equal to $\sigma^2$. Hence, when $\beta_1=0$, the sampling distributions of $MSR$ and $MSE$ are centered at the same value, and $MSR$ and $MSE$ tend to be of the same order of magnitude.
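This comparison is formalized by the $F$-test for $H_0:\beta_1=0$ with $F^*=MSR/MSE$; here is a minimal sketch, reusing lin_reg and scipy.stats from above (the names F_star and p_value are illustrative):

# F-test for H0: beta_1 = 0, built from the ANOVA quantities above
linreg = lin_reg(X, Y)
n = len(X)
msr = linreg['ssr']/1.0           # regression mean square, df = 1
mse = linreg['sse']/(n - 2.0)     # error mean square, df = n - 2
F_star = msr/mse
p_value = stats.f.sf(F_star, 1, n - 2)   # upper-tail F probability
print("F* = {0:.2f}, p-value = {1:.4f}".format(F_star, p_value))

When $\beta_1=0$, $F^*$ should be close to 1; large values of $F^*$ favor the full model.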
The absolute magnitude of the reduction in the variation of $Y$ when $X$ is introduced into the regression model is $SSR$.
In this example it is $3.59$.
The relative reduction is $R^2=\frac{SSR}{SSTO}$.
In this example it is $\frac{3.59}{49.41}=0.073$, i.e., there is a $7.3\%$ relative reduction in the variation of $Y$ when $X$ is introduced.
$R^2$ is called the coefficient of determination.
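This can be verified directly from the lin_reg output, using the identity $SSTO=SSR+SSE$; a minimal sketch:

# coefficient of determination from the sums of squares
linreg = lin_reg(X, Y)
ssto = linreg['ssr'] + linreg['sse']   # SSTO = SSR + SSE
R2 = linreg['ssr']/ssto
print("SSR = {0:.2f}, SSTO = {1:.2f}, R^2 = {2:.3f}".format(linreg['ssr'], ssto, R2))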
%matplotlib inline
plt.figure(figsize=(10, 8))
linreg = lin_reg(X, Y)
b0 = linreg['b0']; b1 = linreg['b1']
# regression line evaluated only for ACT scores between 20 and 30
X1 = np.arange(20, 30, 0.1)
Y1 = b0 + b1*X1
plt.scatter(X, Y, c='c', s=85)
yhat = linreg['Yhat']
plt.plot(X, yhat, linewidth=2, color='k', linestyle='-')
plt.plot(X1, Y1, linewidth=8, color='r')
plt.xlabel('ACT Score', fontsize=18)
plt.ylabel('GPA', fontsize=18)
txt = ("Red line indicates the superimposed regression line for ACT scores between 20 and 30. "
       "Black line indicates the regression line for the complete data.")
plt.text(8.75, -0.75, txt, fontsize=15)
plt.show()
Full model:
$Y_i=\beta_0+\beta_1X_i+\epsilon_i$,
Reduced model:
$Y_i=\beta_0+5X_i+\epsilon_i$.
Since only $\beta_0$ is estimated in the reduced model, its degrees of freedom are $df_R=n-1$ (see the sketch below).
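With $\beta_1$ fixed at $5$, minimizing $\sum(Y_i-\beta_0-5X_i)^2$ over $\beta_0$ gives $b_0=\overline{Y-5X}$; a minimal sketch (the helper name is illustrative):

# reduced model Y_i = beta_0 + 5 X_i + eps_i: only beta_0 is estimated (df_R = n - 1)
def sse_reduced_slope5(X, Y):
    b0_R = np.mean(Y - 5.0*X)        # least-squares estimate of beta_0
    resid = Y - (b0_R + 5.0*X)
    return np.sum(resid**2)          # SSE_R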
Full model:
$Y_i=\beta_0+\beta_1X_i+\epsilon_i$,
Reduced model:
$Y_i=2+5X_i+\epsilon_i$.
Since no parameters are estimated in the reduced model, its degrees of freedom are $df_R=n$ (see the sketch below).
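Because nothing is estimated here, $SSE_R$ is computed directly from the data; a one-line sketch:

# reduced model Y_i = 2 + 5 X_i + eps_i: fully specified, no parameters fitted (df_R = n)
sse_R = np.sum((Y - (2.0 + 5.0*X))**2)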
The two alternative conclusions for the test are:
$H_0: \beta_0=7500$ (reduced model holds),
$H_a: \beta_0 \neq 7500$ (full model holds).
Full model:
$Y_i=\beta_0+\beta_1X_i+\epsilon_i$,
Reduced model:
$Y_i=7500+\beta_1X_i+\epsilon_i$,
Since only $\beta_1$ is estimated in the reduced model, its degrees of freedom are $df_R=n-1$ (see the sketch below).
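With $\beta_0$ fixed at $7500$, minimizing $\sum(Y_i-7500-\beta_1X_i)^2$ over $\beta_1$ gives $b_1=\frac{\sum X_i(Y_i-7500)}{\sum X_i^2}$; a minimal sketch for data on the scale where $\beta_0=7500$ is meaningful (the helper name is illustrative):

# reduced model Y_i = 7500 + beta_1 X_i + eps_i: only beta_1 is estimated (df_R = n - 1)
def sse_reduced_intercept7500(X, Y):
    b1_R = np.sum(X*(Y - 7500.0))/np.sum(X**2)   # least-squares slope with fixed intercept
    resid = Y - (7500.0 + b1_R*X)
    return np.sum(resid**2)                      # SSE_R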
Degrees of freedom for the full model are $df_F=n-2$ and degrees of freedom for the reduced model are $df_R=n-1$.
Therefore $df_R-df_F=1$.
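For the test of $H_0:\beta_1=0$ in the GPA example, the reduced model is $Y_i=\beta_0+\epsilon_i$, so $SSE_R=SSTO$ with $df_R=n-1$; a minimal sketch of the general linear test statistic, which reproduces the ANOVA $F^*$ above:

# general linear test: F* = ((SSE_R - SSE_F)/(df_R - df_F)) / (SSE_F/df_F)
linreg = lin_reg(X, Y)
n = len(X)
sse_F = linreg['sse']; df_F = n - 2.0
sse_R = np.sum((Y - np.mean(Y))**2); df_R = n - 1.0   # SSE_R = SSTO
F_star = ((sse_R - sse_F)/(df_R - df_F))/(sse_F/df_F)
p_value = stats.f.sf(F_star, df_R - df_F, df_F)
print("F* = {0:.2f}, p-value = {1:.4f}".format(F_star, p_value))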