# Start from an empty workspace.
rm(list = ls())

# Make the built-in "swiss" dataset available.
data("swiss")

# Scatterplot matrix of every pair of variables.
pairs(swiss)

# First six rows of the data.
head(swiss, n = 6L)

# Open the help page for the dataset and read about it.
help("swiss")

# Our goal is to understand how the covariates
#
# Agriculture,
# Examination,
# Education,
# Catholic, and
# Infant.Mortality
# 
# are related to Fertility.


# Q1. Use either the length() or dim() function to figure out what the sample
# size is. 

# Let's make a table of information about 5 one-factor models.

# One row per one-factor model: the covariate used, the residual mean square
# (MSE) of the fit, and the estimated slope for that covariate. The numeric
# columns start as missing values and are filled in by the loop below.
model.summaries <- data.frame(vars=c("Agriculture","Examination",
	"Education","Catholic","Infant.Mortality"),
	MSE=NA_real_,
	coef.estimate=NA_real_,
	stringsAsFactors=FALSE) # keep vars as character on every R version

model.summaries

# fit one model per row of the table and fill in the table
for (i in seq_len(nrow(model.summaries)))
{
	# make the formula Fertility ~ <covariate> as a formula object
	model.formula <- reformulate(model.summaries$vars[i],response="Fertility")
	fit <- lm(model.formula,data=swiss)

	# look at the anova (computed once, then reused for the MSE below)
	fit.anova <- anova(fit)
	print(fit.anova)
	
	# look at estimates
	print(summary(fit))
	
	# fill in the table: residual mean square and the covariate's slope,
	# both looked up by name/accessor rather than by position in the object
	model.summaries$MSE[i] <- fit.anova["Residuals","Mean Sq"]
	model.summaries$coef.estimate[i] <- coef(fit)[[2]]
}

# Q2. Which covariates make the unexplained variance in Fertility the 
# smallest (among the one-factor models)? What criterion did you use to
# answer this question?

# Q3. Summarize the relationships between each covariate and fertility.

# Q4. Two covariates from Q2 resulted in relatively small MSEs.
# What happens when both of them are used in a multiple
# regression as covariates? Please explain why this happens.

# Q5. The percent of males involved in agriculture as an occupation  
# appears to be positively associated with Fertility. What happens to
# the effect of Agriculture when Education is added to the model? Please 
# explain why.


# Q6. Consider the following plots.
# Residuals of the Education-only model plotted against each covariate that
# the model leaves out, one panel per covariate.
old.par <- par(mfrow=c(2,2)) # 2x2 grid of panels; remember the old settings
fit.Educ <- lm(Fertility~Education,data=swiss)
# Columns are spelled out in full instead of relying on partial matching by $.
plot(swiss$Agriculture,resid(fit.Educ),xlab="Agriculture",ylab="Resid(Educ only model)")
plot(swiss$Catholic,resid(fit.Educ),xlab="Catholic",ylab="Resid(Educ only model)")
plot(swiss$Examination,resid(fit.Educ),xlab="Examination",ylab="Resid(Educ only model)")
plot(swiss$Infant.Mortality,resid(fit.Educ),xlab="Infant Mortality",ylab="Resid(Educ only model)")
par(old.par) # restore the layout so later plots are not squeezed into the grid
# Which models would you want to consider next? Why?

# Q7. Consider a model that uses all the covariates except for Examination. 
# In what sense is this model better than the models you have seen so far? 
# Do any of the relationships you found in Q3 change substantially? Why does this happen?

# Q8. Consider the following plot
# Residuals from the Education-only fit and from the fit that uses every
# covariate except Examination.
resid.educ <- residuals(fit.Educ)
fit.no.exam <- lm(
	Fertility ~ Agriculture + Education + Catholic + Infant.Mortality,
	data = swiss)
resid.no.exam <- residuals(fit.no.exam)

# Use the same limits on both axes so the two sets of residuals are drawn on
# the same scale.
axis.range <- c(-25, 25)
plot(resid.educ, resid.no.exam,
	xlab = "resid(Educ Only)", ylab = "resid(All but Exam)",
	xlim = axis.range, ylim = axis.range)

# Reference line with intercept = 0 and slope = 1.
abline(a = 0, b = 1)
# Why does this show the improved fit of the model with all but exam?


# Bonus note: You can get confidence intervals for estimated betas with the function 
# confint(fit) where fit is from a call to lm().