# read the data
from read_data_fx import read_tb_data
data=read_tb_data(1,27)
# Age is X
X=data[:,1]
# Muscle mass is Y
Y=data[:,0];
# import the packages required
import numpy as np
import scipy as sc
import statsmodels.api as sm
import matplotlib.pyplot as plt
barX=np.mean(X); barY=np.mean(Y)
XminusbarX=X-barX; YminusbarY=Y-barY
b1=sum(XminusbarX*YminusbarY)/sum(XminusbarX**2)
b0=barY-b1*barX
Yhat=b0+b1*X
e_i=Y-Yhat # Residuals
print b1
Coefficient of Determination
$R^2=1-\frac{SSE}{SSTO}$
ssto=np.sum(YminusbarY**2); sse=np.sum(e_i**2)
Rsq=1-sse/ssto
print 'R^2=%4.3f' % Rsq
Coefficient of Correlation
$r=\pm \sqrt{R^2}$
r= np.sign(b1)*np.sqrt(Rsq)
print 'r=%4.3f' %r
from scipy.stats.stats import pearsonr
r,rx=pearsonr(X,Y)
print 'r=%4.3f' %r
Marginal distributions: $Y_1$ and $Y_2$
# import the mlab package from matplotlib
import matplotlib.mlab as mlab
%matplotlib inline
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
x=np.arange(-5,5,0.01)
X=mlab.normpdf(x,0,1)
plt.plot(x,X,'b-')
plt.xlabel(r'$Y_1$',fontsize=25)
plt.ylabel(r'$P(Y_1)$',fontsize=25)
plt.subplot(1,2,2)
Y=mlab.normpdf(x,0,1)
plt.plot(x,Y,'r-')
plt.xlabel(r'$Y_2$',fontsize=25)
plt.ylabel(r'$P(Y_2)$',fontsize=25)
plt.tight_layout()
# plotting packages
import matplotlib.cm as cm
from mpl_toolkits.mplot3d import Axes3D
# plotting the 3d surface
y1 = np.arange(-3.0, 3.0,0.1)
y2 = np.arange(-2.0, 2.0, 0.1)
Y1, Y2 = np.meshgrid(y1, y2)
Z1 = mlab.bivariate_normal(Y1, Y2, 1.0, 1.0, 0.0, 0.0,0.0)
fig = plt.figure(figsize=(15,10))
ax = fig.gca(projection='3d')
surf = ax.plot_surface(Y1, Y2, Z1, rstride=1, cstride=1, cmap=cm.rainbow,
linewidth=0, antialiased=False)
fig.colorbar(surf, shrink=0.5, aspect=5)
ax.set_xlabel(r'$Y_1$',fontsize=25)
ax.set_ylabel(r'$Y_2$',fontsize=25);
y1 = np.arange(-3.0, 3.0,0.1)
y2 = np.arange(-0.25, 2.0, 0.1)
Y1, Y2 = np.meshgrid(y1, y2)
Z1 = mlab.bivariate_normal(Y1, Y2, 1.0, 1.0, 0.0, 0.0,0.0)
fig = plt.figure(figsize=(15,10))
ax = fig.gca(projection='3d')
surf = ax.plot_surface(Y1, Y2, Z1, rstride=1, cstride=1, cmap=cm.rainbow,
linewidth=0, antialiased=False)
fig.colorbar(surf, shrink=0.5, aspect=5)
ax.set_xlabel(r'$Y_1$',fontsize=25)
ax.set_ylabel(r'$Y_2$',fontsize=25);
ax.view_init(30, -70);
In correlation model $X$ is taken as $Y_1$ and $Y$ is taken as $Y_2$
# read the data
from read_data_fx import read_tb_data
data=read_tb_data(1,27)
X=data[:,1]; Y=data[:,0];
Y1=X;Y2=Y;
barY1=np.mean(Y1); barY2=np.mean(Y2)
Y1minusbarY1=Y1-barY1; Y2minusbarY2=Y2-barY2
r12=np.sum(Y1minusbarY1*Y2minusbarY2)/np.sqrt(np.sum(Y1minusbarY1**2)*np.sum(Y2minusbarY2**2))
print 'r_12= %4.3f' %r12
Test statistics for $r_{12}$ is given by
$t^*=\frac{r_{12}\sqrt{n-2}}{\sqrt{1-r_{12}^2}}$
$H_0: \rho_{12}=0$
$H_a: \rho_{12} \neq 0$
Control Type I error (incorrect rejection of null hypothesis) at $\alpha$ level
If $|t^*| \leq t(1-\alpha/2;n-2)$ then conclude $H_0$
If $|t^*| \gt t(1-\alpha/2;n-2)$ then conclude $H_a$
n=len(Y1)
tstar=(r12*np.sqrt(n-2))/np.sqrt(1-r12**2)
# Packages for importing t-statistics
from scipy import stats
# For \alpha=0.05
t=stats.t.ppf(0.975,n-2)
print '|tstar|= %4.4f' % abs(tstar),', t= %4.4f' %t
As $|t^*| > t$ we conclude $H_a$
# p-value
pval=stats.t.sf(abs(tstar),n-2); pval=2.0*pval; print 'Two sided p-value = %4.4f' % pval
As p-value < $\alpha$ we can conclude $H_a$
Fisher transformation:
$z'=\frac{1}{2} \ln \frac{1+r_{12}}{1-r_{12}}= \tanh^{-1}(r_{12}) $
$E\{z'\}=\zeta=\frac{1}{2} \ln \frac{1+\rho_{12}}{1-\rho_{12}}= \tanh^{-1}(\rho_{12}) $
$\sigma\{z'\}=\frac{1}{n-3}$
And the $1-\alpha$ interval for $\zeta$ is
$z' \pm z(1-\alpha/2)\sigma\{z'\}$
zdash=np.arctanh(r12);sigma_zdash=1.0/(n-3.0)
delz_dash=stats.norm.ppf(0.975)*sigma_zdash
print '1-alpha interval for zeta is'
#print ' %4.4f +/- %4.4f' %(zdash, delz_dash)
delz_dash_1=zdash+delz_dash
delz_dash_2=zdash-delz_dash
print '(%4.4f, %4.4f )' %(delz_dash_2,delz_dash_1)
Therefore, $-1.3515 \leq \zeta \leq -1.2827$
From Fisher's transformation we also get
$\rho_{12}=\frac{e^{2\zeta}-1}{e^{2\zeta}+1}= \tanh(\zeta) $
rho12_1=np.tanh(zdash+delz_dash)
rho12_2=np.tanh(zdash-delz_dash)
print '1-alpha interval for rho_12 is'
print '(%4.4f, %4.4f )' %(rho12_2,rho12_1)
Therefore, $-0.8744 \leq \rho_{12} \leq -0.8572$
$r_s=\frac{\sum (R_{i1}-\overline{R_1})(R_{i2}-\overline{R_2})} { \bigg [ \sum (R_{i1}-\overline{R_1})^2 \sum(R_{i2}-\overline{R_2})^2 \bigg ]^{1/2} }$ and
$\overline{R_1}=\overline{R_2}=(n+1)/2$
R1=stats.rankdata(Y1);R2=stats.rankdata(Y2)
barR1=(n+1.0)/2.0; barR2=barR1
R1minusbarR1=R1-barR1; R2minusbarR2=R2-barR2
rs=np.sum(R1minusbarR1*R2minusbarR2)/np.sqrt(np.sum(R1minusbarR1**2)*np.sum(R2minusbarR2**2))
print 'rs= %4.3f' %rs
Compare with in built python function for spearman correlation
print stats.spearmanr(Y1,Y2)[0]
Test statistics for $r_{s}$ is given by
$t^*=\frac{r_{s}\sqrt{n-2}}{\sqrt{1-r_{s}^2}}$
$H_0:$ There is no association between $Y_1$ and $Y_2$
$H_a:$ There is an association between $Y_1$ and $Y_2$