rm(list=ls()) data <- read.csv("~/AmRental.csv") head(data) # num bedrooms is a factor, so "plot" makes boxplots plot(Rent~num.bedrooms,data=data) # number of appartments for each level table(data$num.bedrooms) # sq ft vs number of bedrooms tapply(data$SqFt,data$num.bedrooms,mean,na.rm=T) # make studio the intercept data$num.bedrooms <- relevel(data$num.bedrooms,ref="Studio") # center SqFt around mean of Studio SqFt (to make itnercept more interpretable) data$SqFt <- data$SqFt-436.1 # fit some models fit.1 <- lm(Rent~num.bedrooms, data=data) fit.2 <- lm(Rent~SqFt, data=data) plot(data$SqFt,resid(fit.1)) plot(data$num.bedrooms,resid(fit.2)) fit.3 <- lm(Rent~num.bedrooms+SqFt, data=data) par(mfrow=c(1,2)) plot(data$SqFt,resid(fit.3)) plot(data$num.bedrooms,resid(fit.3)) par(mfrow=c(1,1)) plot(data$Util.Inc,resid(fit.3)) fit.4 <- lm(Rent~num.bedrooms*SqFt, data=data) # compare mode 3 and model 4 with an F test anova(fit.3,fit.4) plot(data$dist.to.Umass,resid(fit.4)) fit.5 <- lm(Rent~num.bedrooms*SqFt+dist.to.Umass, data=data,x=T) fit.temp <- lm(resid(fit.4)~data$dist.to.Umass) summary(fit.temp) abline(fit.temp,col="red") # partial regression plots fit.UM <- lm(dist.to.Umass~num.bedrooms*SqFt, data=data) plot(resid(fit.UM),resid(fit.4)) fit.partial <- lm(resid(fit.4)~resid(fit.UM)) # compare coef for resid(fit.UM) to coef for dist.to.Umass in fit.5: The same! X <- fit.5$x H <- X%*%solve(t(X)%*%X)%*%t(X) h <- diag(H) d <- resid(fit.5)/(1-h) plot(d) plot(rstandard(fit.5)) par(mfrow=c(2,3)) plot(fit.5,which=1:6) rstandard(fit.5) cooks.distance(fit.5) re.fit.5 <- lm(Rent~num.bedrooms*SqFt+dist.to.Umass, data=data[-c(126,127,131),],x=T)