# Lab: one-factor linear models for Fertility in the "swiss" dataset.
#
# NOTE: this script deliberately does not call rm(list = ls()). Wiping the
# global environment destroys whatever else is in the user's session; if you
# want a clean slate, restart R before running the script instead.

# load the "swiss" dataset (built into R)
data(swiss)

# look at the data
pairs(swiss)
print(head(swiss))  # print() so the output also appears under source()

# Please type ?swiss
# and read about the dataset.

# Our goal is to understand how the covariates
#
#   Agriculture,
#   Examination,
#   Education,
#   Catholic, and
#   Infant.Mortality
#
# are related to Fertility.

# Q1. Use either the length() or dim() function to figure out what the sample
#     size is.

# Let's make a table of information about 5 one-factor models.
# One row per covariate; MSE and coef.estimate are filled in by the loop below.
model.summaries <- data.frame(
  vars = c("Agriculture", "Examination", "Education", "Catholic",
           "Infant.Mortality"),
  MSE = NA,
  coef.estimate = NA,
  stringsAsFactors = FALSE  # keep vars as character on R < 4.0 as well
)
print(model.summaries)

# fit all 5 models and fill in the table
for (i in seq_len(nrow(model.summaries))) {
  # make the formula Fertility ~ <covariate> without pasting strings together
  model.formula <- reformulate(model.summaries$vars[i], response = "Fertility")
  fit <- lm(model.formula, data = swiss)

  # look at the anova
  fit.anova <- anova(fit)
  print(fit.anova)

  # look at estimates
  print(summary(fit))

  # fill in the table
  # MSE is the residual mean square; look it up by name rather than by [2, 3]
  model.summaries$MSE[i] <- fit.anova["Residuals", "Mean Sq"]
  # coef(fit) is c(intercept, slope); [[2]] takes the covariate's slope
  model.summaries$coef.estimate[i] <- coef(fit)[[2]]
}

# Q2. Which covariates make the unexplained variance in fertility the
#     smallest (in 1 factor model)? What criteria did you use to answer this
#     question?

# Q3. Summarize the relationships between each covariate and fertility.

# Q4. Two covariates from Q2 resulted in relatively small MSEs.
#     What happens when both of them are used in a multiple
#     regression as covariates? Please explain why this happens.

# Q5. The percent of males involved in agriculture as an occupation
#     appears to be positively associated with Fertility. What happens to
#     the effect of Agriculture when Education is added to the model? Please
#     explain why.

# Q6. Consider the following plots.
# Residuals of the Education-only model plotted against each remaining
# covariate, one panel per covariate in a 2 x 2 grid.
# par() returns the previous settings, which are saved so the layout can be
# restored once the four panels are drawn.
old.par <- par(mfrow = c(2, 2))
fit.Educ <- lm(Fertility ~ Education, data = swiss)
# Columns are spelled out in full; abbreviations such as swiss$Agri only work
# through partial matching of `$` and break if a similar column is added.
plot(swiss$Agriculture, resid(fit.Educ),
     xlab = "Agriculture", ylab = "Resid(Educ only model)")
plot(swiss$Catholic, resid(fit.Educ),
     xlab = "Catholic", ylab = "Resid(Educ only model)")
plot(swiss$Examination, resid(fit.Educ),
     xlab = "Examination", ylab = "Resid(Educ only model)")
plot(swiss$Infant.Mortality, resid(fit.Educ),
     xlab = "Infant Mortality", ylab = "Resid(Educ only model)")
# back to the previous layout so the next plot gets the whole device
par(old.par)

# Which models would you want to consider next? Why?

# Q7. Consider a model that uses all the covariates except for examination.
#     In what sense is this model better than the models you have seen so far?
#     Do any of the relationships you found in Q3 change substantially? Why
#     does this happen?

# Q8. Consider the following plot
fit.AllButExam <- lm(
  Fertility ~ Agriculture + Education + Catholic + Infant.Mortality,
  data = swiss
)
plot(resid(fit.Educ), resid(fit.AllButExam),
     xlab = "resid(Educ Only)", ylab = "resid(All but Exam)",
     xlim = c(-25, 25), ylim = c(-25, 25))
# Note that xlim and ylim make the x and y axes the same

abline(a = 0, b = 1)
# This adds a line with intercept = 0 and slope = 1.

# Why does this show the improved fit of the model with all but exam?

# Bonus note: You can get confidence intervals for estimated betas with the
# function confint(fit) where fit is from a call to lm().