# Today we explore the first steps one would take in comparing several means.
# The situation is as follows: we have an experiment where we can vary a
# single parameter in k different ways. For example, imagine we measure the
# drop in temperature of water that starts at 180 degrees and is left to sit
# for half an hour in an insulated coffee mug produced by one of k=4
# different manufacturers. In each category we estimate the mean by
# performing the experiment n times; call this sample xi. For each of our
# mugs we imagine performing the experiment n=8 times. The assumption one
# needs to make is that each xi is independently sampled from
# rnorm(mu[i],sigma) - with sigma independent of i. As usual, one needs to
# look at the data to see if this assumption is justified.

# Here is some data:

# God knows:
mu=c(10,5,16,10)
sigma=4

# In your experiment you have:
k=length(mu)
n=8

# And the data you see is:
x1=rnorm(n,mu[1],sigma)
x2=rnorm(n,mu[2],sigma)
x3=rnorm(n,mu[3],sigma)
x4=rnorm(n,mu[4],sigma)
x=cbind(x1,x2,x3,x4)
x
# Note: a negative number indicates experimental error or a heating mug!

# We can look at the data via a box plot (which played a vital role in
# understanding our statistic).
boxplot(list(x[,1],x[,2],x[,3],x[,4]))

# We can look at the means
xbar=apply(x,2,mean)
# and variances
varx=apply(x,2,var)

# In general you will have the following table:
rbind(x,xbar,varx)

# Under the null hypothesis, you can estimate sigma^2 in two ways: the mean
# square treatment and the mean square error.
MST=n*var(xbar)
MSE=mean(varx)

# Looking at the box plot, we discussed that if the means are not all equal,
# then MST should be larger than MSE, and hence the following F will
# typically be bigger than 1.
F=MST/MSE
F

# This is the F-statistic, which under the null is distributed via the
# F distribution with degrees of freedom dfnum and dfdenom (we simulate it
# below). So we can get a P-value.
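# Before computing the P-value, here is an optional sanity check (a sketch,
# not part of the original analysis): R's built-in oneway.test, with
# var.equal=TRUE, performs the same classical one-way ANOVA, so its F should
# agree with our hand-rolled MST/MSE. The set.seed call and the long-format
# reshaping below are additions for reproducibility.

```r
set.seed(1)  # added for reproducibility
mu=c(10,5,16,10); sigma=4; k=length(mu); n=8
x=sapply(mu, function(m) rnorm(n,m,sigma))  # n-by-k matrix, one column per mug
xbar=apply(x,2,mean); varx=apply(x,2,var)
MST=n*var(xbar); MSE=mean(varx)
Fhand=MST/MSE
# Built-in version: stack the columns into long format and call oneway.test
values=as.vector(x)              # column-by-column, so groups line up below
group=factor(rep(1:k,each=n))
fit=oneway.test(values~group, var.equal=TRUE)
unname(fit$statistic)  # should match Fhand up to rounding
Fhand
```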
dfnum=k-1
dfdenom=k*(n-1)
P=1-pf(F,dfnum,dfdenom)
P

####### Now let's verify the behavior of F under the null. Namely, let us
# simulate the null and see if the distribution looks roughly like the
# F distribution with degrees of freedom dfnum and dfdenom.
mu=c(10,10,10,10)
k=length(mu)
sigma=4
n=8
Trials=10000
FList=c()
for (j in 1:Trials){
  x1=rnorm(n,mu[1],sigma)
  x2=rnorm(n,mu[2],sigma)
  x3=rnorm(n,mu[3],sigma)
  x4=rnorm(n,mu[4],sigma)
  x=cbind(x1,x2,x3,x4)
  xbar=apply(x,2,mean)
  varx=apply(x,2,var)
  MST=n*sd(xbar)^2
  MSE=mean(varx)
  F=MST/MSE
  FList=c(FList,F)
}
dfnum=k-1
dfnum
dfdenom=k*(n-1)
dfdenom
X=seq(0,max(FList),by=0.01)
hist(FList[FList<2*(k-1)],freq=FALSE,
     main="Exploring the F-ratio under the Null",xlab="Values",
     ylim=c(0,max(df(X,dfnum,dfdenom))))
points(X,df(X,dfnum,dfdenom),type="l",col="red")

####### Now let's do something akin to a power analysis. Namely, let us pick
# a scenario for our means such that IF it were true, THEN we'd like to be
# able to reject the null with high probability. We simply use our first
# choice of mu.
mu=c(10,5,16,10)
k=length(mu)
sigma=4
n=8
Trials=1000
FList=c()
for (j in 1:Trials){
  x1=rnorm(n,mu[1],sigma)
  x2=rnorm(n,mu[2],sigma)
  x3=rnorm(n,mu[3],sigma)
  x4=rnorm(n,mu[4],sigma)
  x=cbind(x1,x2,x3,x4)
  xbar=apply(x,2,mean)
  varx=apply(x,2,var)
  MST=n*sd(xbar)^2
  MSE=mean(varx)
  F=MST/MSE
  FList=c(FList,F)
}
dfnum=k-1
dfdenom=k*(n-1)
hist(FList,freq=FALSE,main="Exploring the F-ratio for our model",xlab="Values")
X=seq(0,max(FList),by=0.01)
points(X,df(X,dfnum,dfdenom),type="l",col="red")

# Question: If your critical value was 6, then estimate this test's power.
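# One way to approach the question (a sketch, not a worked answer key): the
# power is the probability of rejecting the null when the alternative is
# true, which the simulation above estimates as the fraction of simulated
# F-ratios exceeding the critical value. The self-contained version below
# reruns the alternative-model simulation with replicate(); set.seed is an
# addition for reproducibility.

```r
set.seed(2)  # added for reproducibility
mu=c(10,5,16,10); sigma=4; k=length(mu); n=8
Trials=1000
FList=replicate(Trials, {
  x=sapply(mu, function(m) rnorm(n,m,sigma))   # one column per mug
  MST=n*var(apply(x,2,mean))                   # mean square treatment
  MSE=mean(apply(x,2,var))                     # mean square error
  MST/MSE
})
critical=6
mean(FList > critical)  # estimated power: fraction of simulated rejections
```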