# Here we explore the age-guessing data collected in class.
#
# Objects created by this script (all in the global environment):
#   A          - actual ages of the subjects
#   G          - guesses; G[i, j] is subject i's age as guessed by group j
#   Errors     - all signed errors (guess - actual), sorted ascending
#   Percentile - percentile attached to each entry of Errors
#   Women/Men  - signed errors restricted to the female / male subjects
#   Normalized - guesses centered and scaled per subject (row-wise z-scores)
#   AErrors    - absolute errors, one row per group
#   MakeList   - absolute errors, one list element per group

# This vector (called A) contains the actual reported ages of our 10
# subjects, in years.
A <- c(35, 44, 48, 42, 23, 44, 54, 22, 54, 26)

# This array contains the guesses made by our 9 in-class groups. The rows
# are the samples (subjects) and the columns are the groups, so G[i, j] is
# person i's age as guessed by group j.
G <- cbind(
  c(36, 30, 37, 32, 25, 31, 62, 23, 43, 24),
  c(27, 32, 43, 37, 24, 35, 56, 22, 43, 23),
  c(29, 38, 35, 36, 22, 30, 55, 24, 43, 22),
  c(26, 27, 36, 32, 28, 35, 58, 24, 43, 27),
  c(29, 25, 41, 22, 19, 33, 58, 24.5, 38, 22),
  c(32, 30, 37, 28, 28, 34, 56, 26, 42, 25),
  c(35, 40, 55, 36, 30, 35, 60, 27, 41, 22),
  c(26, 30, 43, 21, 21, 33, 58, 18, 42, 17),
  c(35, 28, 41, 33, 29, 32, 60, 27, 43, 24)
)

# Here is the matrix of the errors, ordered as above. A has one entry per
# row of G, so it is recycled down each column and lines up with subjects.
G - A

# We can look at all the errors after sorting them, and summarize some of
# the key statistics about them.
Errors <- sort(c(G - A))
summary(Errors)
mean(c(G - A))
# stats::sd is named explicitly because this script later assigns a
# variable called `sd`.
stats::sd(c(G - A))

# We can plot a version of the summary called the box plot.
boxplot(Errors)

# We can graph a cumulative distribution function, which tells us the
# percentile associated with each error value.
Percentile <- 100 * seq_along(Errors) / length(Errors)
plot(Errors, Percentile, main = "The Percentile of Every Error Value")

# Another way to view the data is the histogram (the "derivative" of the
# cumulative distribution function).
hist(c(G - A), nclass = 6, main = "A Histogram of the Error Values")

# Question: This histogram looks bimodal. Why might this be?
# One answer: The process of evaluating male and female ages might be
# very different.

# Here is a histogram of our female errors.
LW <- c(2, 3, 5, 6, 9)
Women <- G[LW, ] - A[LW]
hist(Women, nclass = 6,
     main = "Our Histogram Conditioned on Female Samples")
mean(c(Women))

# Here is a histogram of our male errors, drawn on a new graphics device
# so the female histogram stays visible. dev.new() is used instead of
# quartz() because quartz() is only available on macOS.
dev.new()
LM <- c(1, 4, 7, 8, 10)
Men <- G[LM, ] - A[LM]
hist(Men, nclass = 6, main = "Our Histogram Conditioned on Male Samples")
mean(c(Men))

# Question: Do the distributions we see help explain the bimodality?
# Question: Redo the above exploration, exploring age as a cause of the
# bimodality. Can age be used to help us come to terms with the skewness
# and tails of our female and male distributions?

########
# Suppose I was instead interested not in the difference between your
# guesses and the true ages, but rather in the differences between your
# perceptions of the pictures. How might I compare your answers for
# different samples?
# Answer: normalize.

# Per-subject (row) total and mean over the groups.
M1 <- rowSums(G)
M <- M1 / ncol(G)
Centered <- G - M

# Per-subject sample standard deviation. Each row holds ncol(G) guesses,
# so the denominator is ncol(G) - 1 (the number of groups minus one), not
# the number of subjects minus one.
# NOTE: the name `sd` is kept for compatibility with code that may read
# it, but it shadows stats::sd when passed as a value (e.g. to apply()).
sd <- sqrt(rowSums(Centered^2) / (ncol(G) - 1))
Normalized <- Centered / sd
sort(Normalized)
hist(Normalized, nclass = 6)

# We should compare this to the standard normal density.
X <- seq(-3, 3, by = 1 / (2 * 3))
hist(Normalized, freq = FALSE)
lines(X, dnorm(X, 0, 1), col = "blue")

###########
# Question: How should we rank which group did best? (Absolute value,
# mean or median, the price for bad decisions: notice that this choice
# changes the results!)

# Numerical. Row i of AErrors holds group i's absolute errors across the
# subjects, i.e. the transpose of abs(G - A).
AErrors <- t(abs(G - A))
AErrors

# Summary for groups (one column per group).
summary(abs(G - A))

# Summary for samples (one column per subject).
summary(AErrors)

# Pictorial. With 10 samples a box plot is pretty weak, but we can see...
MakeList <- lapply(seq_len(ncol(G)), function(j) abs(G[, j] - A))
MakeList
boxplot(MakeList)