rm(list=ls())

data <- read.csv("~/AmRental.csv")

head(data)

# num bedrooms is a factor, so "plot" makes boxplots
plot(Rent~num.bedrooms,data=data)

# number of appartments for each level
table(data$num.bedrooms)

# sq ft vs number of bedrooms
tapply(data$SqFt,data$num.bedrooms,mean,na.rm=T)

# make studio the intercept
data$num.bedrooms <- relevel(data$num.bedrooms,ref="Studio")

# center SqFt around mean of Studio SqFt (to make itnercept more interpretable)
data$SqFt <- data$SqFt-436.1

# fit some models
fit.1 <- lm(Rent~num.bedrooms, data=data)
fit.2 <- lm(Rent~SqFt, data=data)

plot(data$SqFt,resid(fit.1))
plot(data$num.bedrooms,resid(fit.2))
fit.3 <- lm(Rent~num.bedrooms+SqFt, data=data)
par(mfrow=c(1,2))
plot(data$SqFt,resid(fit.3))
plot(data$num.bedrooms,resid(fit.3))


par(mfrow=c(1,1))
plot(data$Util.Inc,resid(fit.3))

fit.4 <- lm(Rent~num.bedrooms*SqFt, data=data)

# compare mode 3 and model 4 with an F test
anova(fit.3,fit.4)


plot(data$dist.to.Umass,resid(fit.4))

fit.5 <- lm(Rent~num.bedrooms*SqFt+dist.to.Umass, data=data,x=T)

fit.temp <- lm(resid(fit.4)~data$dist.to.Umass)
summary(fit.temp)
abline(fit.temp,col="red")

# partial regression plots
fit.UM <- lm(dist.to.Umass~num.bedrooms*SqFt, data=data)
plot(resid(fit.UM),resid(fit.4))
fit.partial <- lm(resid(fit.4)~resid(fit.UM))

# compare coef for resid(fit.UM) to coef for dist.to.Umass in fit.5: The same!


X <- fit.5$x
H <- X%*%solve(t(X)%*%X)%*%t(X)
h <- diag(H)
d <- resid(fit.5)/(1-h)
plot(d)

plot(rstandard(fit.5))
par(mfrow=c(2,3))
plot(fit.5,which=1:6)

rstandard(fit.5)
cooks.distance(fit.5)

re.fit.5 <- lm(Rent~num.bedrooms*SqFt+dist.to.Umass, 
								data=data[-c(126,127,131),],x=T)