Posted by : Ruilin
1970/01/01
manhattan_final
Ruilin
2023-06-19
Loading of data, removal of incomplete entries and dividing into train and test data
#install.packages("gridExtra")
library(gridExtra)
set.seed(302)
data=read.csv("manhattan.csv")
attach(data)
data1= na.omit(data)
rows <- sample(1:3539, 2800, replace=FALSE)
train<- data1[rows,]
test=data1[-rows,]
Summary of variables of the full model
summary(train[,c(3,4,5,6,2)])
## bedrooms bathrooms size_sqft min_to_subway
## Min. :0.000 Min. :0.000 Min. : 250.0 Min. : 0.00
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.: 611.8 1st Qu.: 2.00
## Median :1.000 Median :1.000 Median : 800.0 Median : 4.00
## Mean :1.358 Mean :1.372 Mean : 942.2 Mean : 5.01
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:1150.2 3rd Qu.: 6.00
## Max. :5.000 Max. :5.000 Max. :3680.0 Max. :43.00
## rent
## Min. : 1300
## 1st Qu.: 3150
## Median : 4000
## Mean : 5167
## 3rd Qu.: 6000
## Max. :20000
summary(test[,c(3,4,5,6,2)])
## bedrooms bathrooms size_sqft min_to_subway
## Min. :0.000 Min. :1.000 Min. : 250.0 Min. : 0.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.: 616.5 1st Qu.: 2.000
## Median :1.000 Median :1.000 Median : 795.0 Median : 3.000
## Mean :1.329 Mean :1.348 Mean : 930.2 Mean : 4.824
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:1100.0 3rd Qu.: 6.000
## Max. :5.000 Max. :4.000 Max. :4800.0 Max. :43.000
## rent
## Min. : 1443
## 1st Qu.: 3182
## Median : 3990
## Mean : 5033
## 3rd Qu.: 5995
## Max. :20000
Original EDA, histgrams,boxplots and scatterplots
attach(train)
## The following objects are masked from data:
##
## bathrooms, bedrooms, borough, building_age_yrs, floor,
## has_dishwasher, has_doorman, has_elevator, has_gym, has_patio,
## has_roofdeck, has_washer_dryer, min_to_subway, neighborhood,
## no_fee, rent, rental_id, size_sqft
par(mfrow=c(3,2))
hist(rent, breaks=10, main="Rent",col="#1793d1")
hist(bedrooms,breaks=10,xlab = "number of bedrooms", main="Number of Bedrooms")
hist(bathrooms, breaks=10,xlab = "number of bathrooms", main="Number of Bathrooms")
hist(size_sqft, breaks=20,xlab = "size", main="Size in sqft")
boxplot(min_to_subway, main = "Minutes to subway")
hist(building_age_yrs,xlab = "building age", main = "Building's age in years")
library(ggplot2)
a=ggplot(data=train, aes(x=bedrooms, y=rent)) +
geom_point() +
geom_smooth(method = lm, se = FALSE) +
labs(x = 'Number of bedrooms', y='Rent',
title = 'Bedrooms VS Rent')
b=ggplot(data=train, aes(x=size_sqft, y=rent)) +
geom_point() +
geom_smooth(method = lm, se = FALSE) +
labs(x = 'Size in sqft', y='Rent',
title = 'Unit VS rent')
grid.arrange(a,b, nrow=1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
powerTransform on y and EDA after transformation on y(rent)
#install.packages("car")
library(car)
## Loading required package: carData
transform <- powerTransform(cbind(data1$rent))
summary(transform)
## bcPower Transformation to Normality
## Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
## Y1 -0.4932 -0.5 -0.5514 -0.4351
##
## Likelihood ratio test that transformation parameter is equal to 0
## (log transformation)
## LRT df pval
## LR test, lambda = (0) 283.5196 1 < 2.22e-16
##
## Likelihood ratio test that no transformation is needed
## LRT df pval
## LR test, lambda = (1) 2670.396 1 < 2.22e-16
train$logRent=log(train$rent)
attach(train)
## The following objects are masked from train (pos = 6):
##
## bathrooms, bedrooms, borough, building_age_yrs, floor,
## has_dishwasher, has_doorman, has_elevator, has_gym, has_patio,
## has_roofdeck, has_washer_dryer, min_to_subway, neighborhood,
## no_fee, rent, rental_id, size_sqft
## The following objects are masked from data:
##
## bathrooms, bedrooms, borough, building_age_yrs, floor,
## has_dishwasher, has_doorman, has_elevator, has_gym, has_patio,
## has_roofdeck, has_washer_dryer, min_to_subway, neighborhood,
## no_fee, rent, rental_id, size_sqft
par(mfrow=c(3,2))
hist(logRent, breaks=10, main="logRent",col="#1793d1")
hist(bedrooms,breaks=10,xlab = "number of bedrooms", main="Number of Bedrooms")
hist(bathrooms, breaks=10,xlab = "number of bathrooms", main="Number of Bathrooms")
hist(size_sqft, breaks=10,xlab = "size", main="Size in sqft")
boxplot(min_to_subway, main = "Minutes to subway")
hist(building_age_yrs, xlab = "building age",main = "Building's age in years")
library(ggplot2)
grid.arrange(a,b, nrow=1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
a=ggplot(data=train, aes(x=bedrooms, y=logRent)) +
geom_point() +
geom_smooth(method = lm, se = FALSE) +
labs(x = 'Number of Bedrooms', y='logRent',
title = 'Bedrooms VS logRent')
b=ggplot(data=train, aes(x=size_sqft, y=logRent)) +
geom_point() +
geom_smooth(method = lm, se = FALSE) +
labs(x = 'Size in sqft', y='logRent',
title = 'Size in sqft VS logRent')
grid.arrange(a,b, nrow=1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Model 1
library(kableExtra)
m1=lm(logRent~bedrooms+bathrooms+size_sqft+min_to_subway+building_age_yrs+floor+building_age_yrs+has_dishwasher+has_doorman+has_elevator+has_gym+has_roofdeck,data=train)
summary(m1)
##
## Call:
## lm(formula = logRent ~ bedrooms + bathrooms + size_sqft + min_to_subway +
## building_age_yrs + floor + building_age_yrs + has_dishwasher +
## has_doorman + has_elevator + has_gym + has_roofdeck, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.29499 -0.12838 -0.00938 0.13342 1.36153
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.602e+00 1.613e-02 471.379 < 2e-16 ***
## bedrooms 2.028e-02 7.797e-03 2.601 0.00935 **
## bathrooms 1.086e-01 1.338e-02 8.119 6.99e-16 ***
## size_sqft 7.529e-04 1.846e-05 40.777 < 2e-16 ***
## min_to_subway -4.906e-03 8.327e-04 -5.892 4.29e-09 ***
## building_age_yrs -2.203e-03 1.290e-04 -17.081 < 2e-16 ***
## floor 4.335e-03 4.557e-04 9.511 < 2e-16 ***
## has_dishwasher 1.500e-02 1.285e-02 1.167 0.24343
## has_doorman -6.726e-03 1.519e-02 -0.443 0.65795
## has_elevator 2.253e-02 1.577e-02 1.429 0.15322
## has_gym -2.108e-02 1.707e-02 -1.235 0.21700
## has_roofdeck 9.702e-03 1.568e-02 0.619 0.53610
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2392 on 2788 degrees of freedom
## Multiple R-squared: 0.7807, Adjusted R-squared: 0.7799
## F-statistic: 902.4 on 11 and 2788 DF, p-value: < 2.2e-16
Model 2, AIC, BIC, ajusted R square checking
m2=lm(logRent~bedrooms+bathrooms+size_sqft+min_to_subway+building_age_yrs+floor+building_age_yrs,data=train)
summary(m2)
##
## Call:
## lm(formula = logRent ~ bedrooms + bathrooms + size_sqft + min_to_subway +
## building_age_yrs + floor + building_age_yrs, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.30270 -0.12802 -0.00836 0.13231 1.37624
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.608e+00 1.585e-02 479.942 < 2e-16 ***
## bedrooms 1.944e-02 7.780e-03 2.499 0.0125 *
## bathrooms 1.084e-01 1.337e-02 8.105 7.83e-16 ***
## size_sqft 7.552e-04 1.843e-05 40.973 < 2e-16 ***
## min_to_subway -4.934e-03 8.324e-04 -5.928 3.44e-09 ***
## building_age_yrs -2.208e-03 1.289e-04 -17.134 < 2e-16 ***
## floor 4.317e-03 4.536e-04 9.517 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2392 on 2793 degrees of freedom
## Multiple R-squared: 0.7803, Adjusted R-squared: 0.7798
## F-statistic: 1653 on 6 and 2793 DF, p-value: < 2.2e-16
#SSres, adjusted R square, AIC, AICc, BIC check
summary(m2)
##
## Call:
## lm(formula = logRent ~ bedrooms + bathrooms + size_sqft + min_to_subway +
## building_age_yrs + floor + building_age_yrs, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.30270 -0.12802 -0.00836 0.13231 1.37624
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.608e+00 1.585e-02 479.942 < 2e-16 ***
## bedrooms 1.944e-02 7.780e-03 2.499 0.0125 *
## bathrooms 1.084e-01 1.337e-02 8.105 7.83e-16 ***
## size_sqft 7.552e-04 1.843e-05 40.973 < 2e-16 ***
## min_to_subway -4.934e-03 8.324e-04 -5.928 3.44e-09 ***
## building_age_yrs -2.208e-03 1.289e-04 -17.134 < 2e-16 ***
## floor 4.317e-03 4.536e-04 9.517 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2392 on 2793 degrees of freedom
## Multiple R-squared: 0.7803, Adjusted R-squared: 0.7798
## F-statistic: 1653 on 6 and 2793 DF, p-value: < 2.2e-16
summary(m1)
##
## Call:
## lm(formula = logRent ~ bedrooms + bathrooms + size_sqft + min_to_subway +
## building_age_yrs + floor + building_age_yrs + has_dishwasher +
## has_doorman + has_elevator + has_gym + has_roofdeck, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.29499 -0.12838 -0.00938 0.13342 1.36153
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.602e+00 1.613e-02 471.379 < 2e-16 ***
## bedrooms 2.028e-02 7.797e-03 2.601 0.00935 **
## bathrooms 1.086e-01 1.338e-02 8.119 6.99e-16 ***
## size_sqft 7.529e-04 1.846e-05 40.777 < 2e-16 ***
## min_to_subway -4.906e-03 8.327e-04 -5.892 4.29e-09 ***
## building_age_yrs -2.203e-03 1.290e-04 -17.081 < 2e-16 ***
## floor 4.335e-03 4.557e-04 9.511 < 2e-16 ***
## has_dishwasher 1.500e-02 1.285e-02 1.167 0.24343
## has_doorman -6.726e-03 1.519e-02 -0.443 0.65795
## has_elevator 2.253e-02 1.577e-02 1.429 0.15322
## has_gym -2.108e-02 1.707e-02 -1.235 0.21700
## has_roofdeck 9.702e-03 1.568e-02 0.619 0.53610
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2392 on 2788 degrees of freedom
## Multiple R-squared: 0.7807, Adjusted R-squared: 0.7799
## F-statistic: 902.4 on 11 and 2788 DF, p-value: < 2.2e-16
#multicollinearity check
vif(m2)
## bedrooms bathrooms size_sqft min_to_subway
## 2.777873 3.231427 3.768605 1.047040
## building_age_yrs floor
## 1.246706 1.197529
Partial model, model 3 and paritial f test
m3=lm(logRent~bedrooms+size_sqft+min_to_subway+building_age_yrs,data=train)
anova(m2,m3)
condition check for model 2 with plots
r <- resid(m2)
#condition 1
plot(rent ~ fitted(m2), main="Y versus Y-hat", xlab="logRent-hat", ylab="logRent")
abline(a = 0, b = 1)
lines(lowess(rent ~ fitted(m2)), lty=2)
#condition 2
data2 = data.frame(train$rent, train$bedrooms, train$size_sqft)
pairs( data2 )
4 assumption checks of model 2 using residual plots, qqplots. cook distance plots
par(mfrow=c(2,2))
##residual vs fitted plot
plot(m2,1)
##qqplot
plot(m2,2)
##residual vs X plots
plot(train$bedrooms, r, xlab="Number of bedrooms", ylab="Residuals", main="Residuals vs x1")
plot(train$bathrooms, r, xlab="Number of bathrooms", ylab="Residuals", main="Residuals vs x2")
par(mfrow=c(2,2))
plot(train$size_sqft, r, xlab="Size", ylab="Residuals", main="Residuals vs x3")
plot(train$min_to_subway, r, xlab="Minutes to subway", ylab="Residuals", main="Residuals vs x4")
plot(train$building_age_yrs, r, xlab="Building age", ylab="Residuals", main="Residuals vs x5")
plot(train$floor, r, xlab="Floor", ylab="Residuals", main="Residuals vs x6")
par(mfrow=c(2,2))
##cook distance plot
plot(m2,4)
outlier points, leverage points, influential points, and multicollinearity check
#outlier points
r <- rstandard(m2)
out <- which(r > 2 | r < -2)
out
## 618 1881 2176 2557 2090 2801 185 3220 851 3232 1760 1346 3186 1566 384 1089
## 27 33 36 53 83 93 98 102 147 149 161 165 174 246 266 319
## 49 919 671 2364 638 1459 1560 3363 2856 314 338 1960 2097 732 1254 1908
## 325 329 392 407 410 441 447 452 480 500 519 529 538 560 566 570
## 1905 2852 854 716 281 578 3279 72 201 3067 378 1170 3163 2371 94 1313
## 574 585 599 635 705 707 708 711 714 715 726 735 803 814 836 856
## 1558 2635 2095 2824 2668 3478 1305 1884 2577 1808 2960 2118 992 277 3258 2811
## 857 862 870 908 940 958 992 993 1013 1025 1037 1039 1060 1063 1090 1109
## 3483 2799 2496 3277 931 455 1502 1334 727 758 1316 2073 1767 538 3435 2401
## 1115 1116 1121 1129 1140 1141 1148 1164 1184 1201 1210 1212 1228 1229 1243 1255
## 2449 1417 2861 1226 3097 1399 290 1998 645 436 228 1845 3293 3133 1184 1583
## 1278 1319 1330 1352 1380 1383 1409 1431 1464 1502 1537 1540 1618 1649 1662 1671
## 3522 2671 628 354 1329 3418 2922 508 1510 402 981 529 1152 938 208 3255
## 1680 1696 1708 1726 1750 1759 1770 1771 1773 1782 1785 1833 1844 1855 1856 1859
## 657 652 2269 553 552 1512 1921 3300 70 1894 2455 1100 3491 2728 1231 1586
## 1872 1886 1894 1906 1921 1925 1989 1998 2001 2004 2055 2115 2128 2151 2161 2162
## 312 3469 25 2459 3026 3381 2394 2948 946 1930 2885 2749 2641 3353 764 128
## 2221 2306 2309 2314 2327 2332 2333 2346 2356 2367 2369 2373 2378 2385 2396 2415
## 1204 2677 3100 3432 1661 1052 3040 3446 2035 2167 2291 2737 2786 614 1736 146
## 2419 2479 2484 2491 2495 2527 2541 2553 2573 2591 2605 2607 2616 2646 2648 2653
## 2387 1251 1437 2698 74 899 2778 471
## 2660 2662 2665 2742 2750 2752 2781 2787
#leverage points
h <- hatvalues(m2)
threshold <- 2 * (length(m2$coefficients)/nrow(train))
w <- which(h > threshold)
train[w,]
#influential points
D <- cooks.distance(m2)
cutoff <- qf(0.5, length(m2$coefficients), nrow(train)-length(m2$coefficients), lower.tail=T)
which(D > cutoff)
## named integer(0)
fitting model(model 5) using test data and its plots, adjusted R square, AIC, BIC
test$logRent=log(test$rent)
m5=lm(logRent~bedrooms+bathrooms+size_sqft+min_to_subway+building_age_yrs+floor+building_age_yrs,data=test)
summary(m5)
##
## Call:
## lm(formula = logRent ~ bedrooms + bathrooms + size_sqft + min_to_subway +
## building_age_yrs + floor + building_age_yrs, data = test)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.93733 -0.13083 -0.00126 0.12749 0.89563
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.533e+00 3.118e-02 241.624 < 2e-16 ***
## bedrooms 3.514e-02 1.440e-02 2.440 0.0149 *
## bathrooms 2.352e-01 2.689e-02 8.746 < 2e-16 ***
## size_sqft 5.648e-04 3.377e-05 16.729 < 2e-16 ***
## min_to_subway -3.342e-03 1.697e-03 -1.969 0.0493 *
## building_age_yrs -1.741e-03 2.509e-04 -6.937 8.82e-12 ***
## floor 6.293e-03 8.749e-04 7.193 1.58e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.241 on 732 degrees of freedom
## Multiple R-squared: 0.7618, Adjusted R-squared: 0.7599
## F-statistic: 390.2 on 6 and 732 DF, p-value: < 2.2e-16
# Plots to check conditions
r <- resid(m5)
#condition 1
plot(test$rent ~ fitted(m5), main="Y versus Y-hat", xlab="logRent-hat", ylab="logRent")
abline(a = 0, b = 1)
lines(lowess(test$rent ~ fitted(m5)), lty=2)
#condition 2
data2 = data.frame(test$rent, test$bedrooms,test$size_sqft)
pairs( data2 )
#Plots to check assumptions
par(mfrow=c(2,2))
##residual vs fitted
plot(m5,1)
##qqplot
plot(m5,2)
##residual vs X
plot(test$bedrooms, r, xlab="Number of bedrooms", ylab="Residuals", main="Residuals vs x1")
plot(test$bathrooms, r, xlab="Number of bathrooms", ylab="Residuals", main="Residuals vs x2")
par(mfrow=c(2,2))
plot(test$size_sqft, r, xlab="Size", ylab="Residuals", main="Residuals vs x3")
plot(test$min_to_subway, r, xlab="Minutes to subway", ylab="Residuals", main="Residuals vs x4")
plot(test$building_age_yrs, r, xlab="Building age", ylab="Residuals", main="Residuals vs x5")
plot(test$floor, r, xlab="Floor", ylab="Residuals", main="Residuals vs x6")
par(mfrow=c(2,2))
#Extra:cook distance
plot(m5,4)
#SSres, adjusted R square, AIC, AICc, BIC check
summary(m2)
##
## Call:
## lm(formula = logRent ~ bedrooms + bathrooms + size_sqft + min_to_subway +
## building_age_yrs + floor + building_age_yrs, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.30270 -0.12802 -0.00836 0.13231 1.37624
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.608e+00 1.585e-02 479.942 < 2e-16 ***
## bedrooms 1.944e-02 7.780e-03 2.499 0.0125 *
## bathrooms 1.084e-01 1.337e-02 8.105 7.83e-16 ***
## size_sqft 7.552e-04 1.843e-05 40.973 < 2e-16 ***
## min_to_subway -4.934e-03 8.324e-04 -5.928 3.44e-09 ***
## building_age_yrs -2.208e-03 1.289e-04 -17.134 < 2e-16 ***
## floor 4.317e-03 4.536e-04 9.517 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2392 on 2793 degrees of freedom
## Multiple R-squared: 0.7803, Adjusted R-squared: 0.7798
## F-statistic: 1653 on 6 and 2793 DF, p-value: < 2.2e-16
summary(m5)
##
## Call:
## lm(formula = logRent ~ bedrooms + bathrooms + size_sqft + min_to_subway +
## building_age_yrs + floor + building_age_yrs, data = test)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.93733 -0.13083 -0.00126 0.12749 0.89563
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.533e+00 3.118e-02 241.624 < 2e-16 ***
## bedrooms 3.514e-02 1.440e-02 2.440 0.0149 *
## bathrooms 2.352e-01 2.689e-02 8.746 < 2e-16 ***
## size_sqft 5.648e-04 3.377e-05 16.729 < 2e-16 ***
## min_to_subway -3.342e-03 1.697e-03 -1.969 0.0493 *
## building_age_yrs -1.741e-03 2.509e-04 -6.937 8.82e-12 ***
## floor 6.293e-03 8.749e-04 7.193 1.58e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.241 on 732 degrees of freedom
## Multiple R-squared: 0.7618, Adjusted R-squared: 0.7599
## F-statistic: 390.2 on 6 and 732 DF, p-value: < 2.2e-16