Autores: Davi Toshio e Henrique Cursino Vieira
Carregando as bibliotecas
library(dplyr)
library(ggplot2)
library(tidyr)
library(readr)
library(caret)
library(ranger)
library(rpart)
library(rpart.plot)
library(corrplot)
library(statisticalModeling)
library(mlbench)
library(e1071)
Este kernel é uma aplicação prática de análise exploratória de dados e aprendizado de máquina utilizando a linguagem R. Aqui analisamos os dados do desafio “House Prices: Advanced Regression /Techniques”, do repositório Kaggle.
Link: https://www.kaggle.com/c/house-prices-advanced-regression-techniques
1.Pré-tratamento dos dados
1.2.Carregando teste e treino
setwd('/home/davi/Desktop/house_prices-_advanced_regression_techniques/')
train <- read.csv(file = 'train.csv', sep= ',', stringsAsFactors = F)
test <- read.csv(file = 'test.csv', sep= ',', stringsAsFactors = F)
# Adicionando coluna SalePrice no conjunto teste
test$SalePrice <- NA
# Criando coluna index do data set
train$dataset <- 'train'
test$dataset <- 'test'
# Juntando dataset, melhora
joined_data <- rbind(train, test)
1.3.Definição de tipos e eliminação de variáveis
# Remove coluna Utilities (super representatividade de um fator)
which(colnames(joined_data) == 'Utilities')
## [1] 10
joined_data <- joined_data[ ,-10]
# Outlier data$GarageYrBlt 2207 to 2007
joined_data$GarageYrBlt[which(joined_data$GarageYrBlt == 2207)] <- 2007
# Ajustando conteudo de variáveis onde NA é um factor
joined_data$Alley[which(is.na(joined_data$Alley))] <- 'no_alley_access'
joined_data$BsmtQual[which(is.na(joined_data$BsmtQual))] <- 'no_basement'
joined_data$BsmtCond[which(is.na(joined_data$BsmtCond))] <- 'no_basement'
joined_data$BsmtFinType1[which(is.na(joined_data$BsmtFinType1))] <- 'no_basement'
joined_data$BsmtFinType2[which(is.na(joined_data$BsmtFinType2))] <- 'no_basement'
joined_data$PoolQC[which(is.na(joined_data$PoolQC))] <- 'no_pool'
joined_data$Fence[which(is.na(joined_data$Fence))] <- 'no_fence'
joined_data$MiscFeature[which(is.na(joined_data$MiscFeature))] <- 'none'
joined_data$BsmtExposure[which(is.na(joined_data$BsmtExposure))] <- 'no_basemente'
joined_data$FireplaceQu[which(is.na(joined_data$FireplaceQu))] <- 'no_fireplace'
joined_data$GarageType[which(is.na(joined_data$GarageType))] <- 'no_garage'
joined_data$GarageFinish[which(is.na(joined_data$GarageFinish))] <- 'no_garage'
joined_data$GarageQual[which(is.na(joined_data$GarageQual))] <- 'no_garage'
joined_data$GarageCond[which(is.na(joined_data$GarageCond))] <- 'no_garage'
# Definindo e convertendo as variaveis categoricas (factors)
joined_data$MSSubClass <- as.factor(joined_data$MSSubClass)
joined_data$OverallQual <- as.factor(joined_data$OverallQual)
joined_data$OverallCond <- as.factor(joined_data$OverallCond)
joined_data$GarageYrBlt <- as.factor(joined_data$GarageYrBlt)
joined_data$YrSold <- as.factor(joined_data$YrSold)
joined_data$MoSold <- as.factor(joined_data$MoSold)
joined_data$GarageYrBlt <- as.factor(joined_data$GarageYrBlt)
joined_data$YearBuilt <- as.factor(joined_data$YearBuilt)
joined_data$YearRemodAdd <- as.factor(joined_data$YearRemodAdd)
# Coverte caracteres para factors
joined_data[ ,which(sapply(joined_data, is.character))] <- lapply(joined_data[ ,which(sapply(joined_data, is.character))], as.factor)
# BsmtFinSF1
na_position <- which(is.na(joined_data$BsmtFinSF1))
# Via de regra, para todo no_basement nas variaveis Bsmt, o valor e zero!
joined_data %>% select(contains('Bsmt')) %>%
filter( BsmtFinType1== 'no_basement') %>%
summary()
## BsmtQual BsmtCond BsmtExposure BsmtFinType1
## Ex : 0 Fa : 0 Av : 0 ALQ : 0
## Fa : 0 Gd : 0 Gd : 0 BLQ : 0
## Gd : 0 no_basement:79 Mn : 0 GLQ : 0
## no_basement:79 Po : 0 No : 0 LwQ : 0
## TA : 0 TA : 0 no_basemente:79 no_basement:79
## Rec : 0
## Unf : 0
## BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## Min. :0 ALQ : 0 Min. :0 Min. :0 Min. :0
## 1st Qu.:0 BLQ : 0 1st Qu.:0 1st Qu.:0 1st Qu.:0
## Median :0 GLQ : 0 Median :0 Median :0 Median :0
## Mean :0 LwQ : 0 Mean :0 Mean :0 Mean :0
## 3rd Qu.:0 no_basement:79 3rd Qu.:0 3rd Qu.:0 3rd Qu.:0
## Max. :0 Rec : 0 Max. :0 Max. :0 Max. :0
## NA's :1 Unf : 0 NA's :1 NA's :1 NA's :1
## BsmtFullBath BsmtHalfBath
## Min. :0 Min. :0
## 1st Qu.:0 1st Qu.:0
## Median :0 Median :0
## Mean :0 Mean :0
## 3rd Qu.:0 3rd Qu.:0
## Max. :0 Max. :0
## NA's :2 NA's :2
# BsmtFinSF1
joined_data$BsmtFinSF1[which(is.na(joined_data$BsmtFinSF1))] <- 0
# BsmtFinSF2
joined_data$BsmtFinSF2[which(is.na(joined_data$BsmtFinSF2))] <- 0
# BsmtUnfSF
joined_data$BsmtUnfSF[which(is.na(joined_data$BsmtUnfSF))] <- 0
# TotalBsmtSF
joined_data$TotalBsmtSF[which(is.na(joined_data$TotalBsmtSF))] <- 0
# BsmtFullBath
joined_data$BsmtFullBath[which(is.na(joined_data$BsmtFullBath))] <- 0
# BsmtHalfBath
joined_data$BsmtHalfBath[which(is.na(joined_data$BsmtHalfBath))] <- 0
# GarageCars
# Igual ao problema anterior, vemos que nas demais variaveis relacionada
# Temos 'no_garage', logo podemos colocar zero quando nao existir informacao
joined_data$GarageCars[which(is.na(joined_data$GarageCars))] <- 0
# GarageArea
joined_data$GarageArea[which(is.na(joined_data$GarageArea))] <- 0
# GarageYrBlt
# Variavel troll: onde tem NA, esta relacionado com no_garage ... --'
joined_data$GarageYrBlt <- as.character(joined_data$GarageYrBlt)
joined_data$GarageYrBlt[which(is.na(joined_data$GarageYrBlt))] <- 'no_garage'
joined_data$GarageYrBlt <- as.factor(joined_data$GarageYrBlt)
# Separa coluna de factors para analisar
col_factor <- which(sapply(joined_data, is.factor))
joined_data_factors <- joined_data[ , col_factor]
# Separa coluna de numericos
col_numeric <- which(sapply(joined_data, is.numeric))
names(joined_data[,col_numeric])
## [1] "Id" "LotFrontage" "LotArea" "MasVnrArea"
## [5] "BsmtFinSF1" "BsmtFinSF2" "BsmtUnfSF" "TotalBsmtSF"
## [9] "X1stFlrSF" "X2ndFlrSF" "LowQualFinSF" "GrLivArea"
## [13] "BsmtFullBath" "BsmtHalfBath" "FullBath" "HalfBath"
## [17] "BedroomAbvGr" "KitchenAbvGr" "TotRmsAbvGrd" "Fireplaces"
## [21] "GarageCars" "GarageArea" "WoodDeckSF" "OpenPorchSF"
## [25] "EnclosedPorch" "X3SsnPorch" "ScreenPorch" "PoolArea"
## [29] "MiscVal" "SalePrice"
1.4.Distribuição dos factors nas variável categóricas
Uma quantidade significativa das variáveis categóricas estão bem desproporcionais … :( Este é um problema que precisa ser melhor trabalhado futuramente. Elas são informativas, porém podem causar problema em nosso modelo.
# Coluna SalePrice: 32
list_of_var_counts <- apply(joined_data_factors, 2, function(x){
table <- data.frame(table(x), as.numeric(table(x)/sum(table(x))))
colnames(table) <- c('factor_type', 'raw_count', 'percentage')
return(table)
})
list_of_var_counts
## $MSSubClass
## factor_type raw_count percentage
## 1 120 182 0.0623501199
## 2 150 1 0.0003425831
## 3 160 128 0.0438506338
## 4 180 17 0.0058239123
## 5 190 61 0.0208975677
## 6 20 1079 0.3696471394
## 7 30 139 0.0476190476
## 8 40 6 0.0020554985
## 9 45 18 0.0061664954
## 10 50 287 0.0983213429
## 11 60 575 0.1969852689
## 12 70 128 0.0438506338
## 13 75 23 0.0078794108
## 14 80 118 0.0404248030
## 15 85 48 0.0164439877
## 16 90 109 0.0373415553
##
## $MSZoning
## factor_type raw_count percentage
## 1 C (all) 25 0.008576329
## 2 FV 139 0.047684391
## 3 RH 26 0.008919383
## 4 RL 2265 0.777015437
## 5 RM 460 0.157804460
##
## $Street
## factor_type raw_count percentage
## 1 Grvl 12 0.004110997
## 2 Pave 2907 0.995889003
##
## $Alley
## factor_type raw_count percentage
## 1 Grvl 120 0.04110997
## 2 no_alley_access 2721 0.93216855
## 3 Pave 78 0.02672148
##
## $LotShape
## factor_type raw_count percentage
## 1 IR1 968 0.331620418
## 2 IR2 76 0.026036314
## 3 IR3 16 0.005481329
## 4 Reg 1859 0.636861939
##
## $LandContour
## factor_type raw_count percentage
## 1 Bnk 117 0.04008222
## 2 HLS 120 0.04110997
## 3 Low 60 0.02055498
## 4 Lvl 2622 0.89825283
##
## $LotConfig
## factor_type raw_count percentage
## 1 Corner 511 0.175059952
## 2 CulDSac 176 0.060294621
## 3 FR2 85 0.029119561
## 4 FR3 14 0.004796163
## 5 Inside 2133 0.730729702
##
## $LandSlope
## factor_type raw_count percentage
## 1 Gtl 2778 0.951695786
## 2 Mod 125 0.042822885
## 3 Sev 16 0.005481329
##
## $Neighborhood
## factor_type raw_count percentage
## 1 Blmngtn 28 0.009592326
## 2 Blueste 10 0.003425831
## 3 BrDale 30 0.010277492
## 4 BrkSide 108 0.036998972
## 5 ClearCr 44 0.015073655
## 6 CollgCr 267 0.091469681
## 7 Crawfor 103 0.035286057
## 8 Edwards 194 0.066461117
## 9 Gilbert 165 0.056526208
## 10 IDOTRR 93 0.031860226
## 11 MeadowV 37 0.012675574
## 12 Mitchel 114 0.039054471
## 13 NAmes 443 0.151764303
## 14 NoRidge 71 0.024323398
## 15 NPkVill 23 0.007879411
## 16 NridgHt 166 0.056868791
## 17 NWAmes 131 0.044878383
## 18 OldTown 239 0.081877355
## 19 Sawyer 151 0.051730045
## 20 SawyerW 125 0.042822885
## 21 Somerst 182 0.062350120
## 22 StoneBr 51 0.017471737
## 23 SWISU 48 0.016443988
## 24 Timber 72 0.024665982
## 25 Veenker 24 0.008221994
##
## $Condition1
## factor_type raw_count percentage
## 1 Artery 92 0.031517643
## 2 Feedr 164 0.056183625
## 3 Norm 2511 0.860226105
## 4 PosA 20 0.006851662
## 5 PosN 39 0.013360740
## 6 RRAe 28 0.009592326
## 7 RRAn 50 0.017129154
## 8 RRNe 6 0.002055498
## 9 RRNn 9 0.003083248
##
## $Condition2
## factor_type raw_count percentage
## 1 Artery 5 0.0017129154
## 2 Feedr 13 0.0044535800
## 3 Norm 2889 0.9897225077
## 4 PosA 4 0.0013703323
## 5 PosN 4 0.0013703323
## 6 RRAe 1 0.0003425831
## 7 RRAn 1 0.0003425831
## 8 RRNn 2 0.0006851662
##
## $BldgType
## factor_type raw_count percentage
## 1 1Fam 2425 0.83076396
## 2 2fmCon 62 0.02124015
## 3 Duplex 109 0.03734156
## 4 Twnhs 96 0.03288798
## 5 TwnhsE 227 0.07776636
##
## $HouseStyle
## factor_type raw_count percentage
## 1 1.5Fin 314 0.107571086
## 2 1.5Unf 19 0.006509078
## 3 1Story 1471 0.503939705
## 4 2.5Fin 8 0.002740665
## 5 2.5Unf 24 0.008221994
## 6 2Story 872 0.298732443
## 7 SFoyer 83 0.028434395
## 8 SLvl 128 0.043850634
##
## $OverallQual
## factor_type raw_count percentage
## 1 1 4 0.001370332
## 2 10 31 0.010620075
## 3 2 13 0.004453580
## 4 3 40 0.013703323
## 5 4 226 0.077423775
## 6 5 825 0.282631038
## 7 6 731 0.250428229
## 8 7 600 0.205549846
## 9 8 342 0.117163412
## 10 9 107 0.036656389
##
## $OverallCond
## factor_type raw_count percentage
## 1 1 7 0.002398082
## 2 2 10 0.003425831
## 3 3 50 0.017129154
## 4 4 101 0.034600891
## 5 5 1645 0.563549161
## 6 6 531 0.181911614
## 7 7 390 0.133607400
## 8 8 144 0.049331963
## 9 9 41 0.014045906
##
## $YearBuilt
## factor_type raw_count percentage
## 1 1872 1 0.0003425831
## 2 1875 1 0.0003425831
## 3 1879 1 0.0003425831
## 4 1880 5 0.0017129154
## 5 1882 1 0.0003425831
## 6 1885 2 0.0006851662
## 7 1890 7 0.0023980815
## 8 1892 2 0.0006851662
## 9 1893 1 0.0003425831
## 10 1895 3 0.0010277492
## 11 1896 1 0.0003425831
## 12 1898 1 0.0003425831
## 13 1900 29 0.0099349092
## 14 1901 2 0.0006851662
## 15 1902 1 0.0003425831
## 16 1904 1 0.0003425831
## 17 1905 3 0.0010277492
## 18 1906 1 0.0003425831
## 19 1907 1 0.0003425831
## 20 1908 2 0.0006851662
## 21 1910 43 0.0147310723
## 22 1911 1 0.0003425831
## 23 1912 5 0.0017129154
## 24 1913 1 0.0003425831
## 25 1914 8 0.0027406646
## 26 1915 24 0.0082219938
## 27 1916 10 0.0034258308
## 28 1917 3 0.0010277492
## 29 1918 10 0.0034258308
## 30 1919 5 0.0017129154
## 31 1920 57 0.0195272354
## 32 1921 11 0.0037684138
## 33 1922 16 0.0054813292
## 34 1923 17 0.0058239123
## 35 1924 16 0.0054813292
## 36 1925 34 0.0116478246
## 37 1926 19 0.0065090785
## 38 1927 9 0.0030832477
## 39 1928 9 0.0030832477
## 40 1929 8 0.0027406646
## 41 1930 26 0.0089071600
## 42 1931 7 0.0023980815
## 43 1932 5 0.0017129154
## 44 1934 5 0.0017129154
## 45 1935 13 0.0044535800
## 46 1936 11 0.0037684138
## 47 1937 9 0.0030832477
## 48 1938 13 0.0044535800
## 49 1939 20 0.0068516615
## 50 1940 36 0.0123329908
## 51 1941 23 0.0078794108
## 52 1942 6 0.0020554985
## 53 1945 15 0.0051387461
## 54 1946 15 0.0051387461
## 55 1947 11 0.0037684138
## 56 1948 27 0.0092497431
## 57 1949 18 0.0061664954
## 58 1950 38 0.0130181569
## 59 1951 18 0.0061664954
## 60 1952 18 0.0061664954
## 61 1953 24 0.0082219938
## 62 1954 43 0.0147310723
## 63 1955 34 0.0116478246
## 64 1956 39 0.0133607400
## 65 1957 35 0.0119904077
## 66 1958 48 0.0164439877
## 67 1959 43 0.0147310723
## 68 1960 37 0.0126755738
## 69 1961 34 0.0116478246
## 70 1962 35 0.0119904077
## 71 1963 35 0.0119904077
## 72 1964 33 0.0113052415
## 73 1965 34 0.0116478246
## 74 1966 35 0.0119904077
## 75 1967 41 0.0140459061
## 76 1968 45 0.0154162384
## 77 1969 28 0.0095923261
## 78 1970 42 0.0143884892
## 79 1971 39 0.0133607400
## 80 1972 40 0.0137033231
## 81 1973 21 0.0071942446
## 82 1974 23 0.0078794108
## 83 1975 25 0.0085645769
## 84 1976 54 0.0184994861
## 85 1977 57 0.0195272354
## 86 1978 39 0.0133607400
## 87 1979 21 0.0071942446
## 88 1980 23 0.0078794108
## 89 1981 9 0.0030832477
## 90 1982 7 0.0023980815
## 91 1983 8 0.0027406646
## 92 1984 19 0.0065090785
## 93 1985 7 0.0023980815
## 94 1986 10 0.0034258308
## 95 1987 8 0.0027406646
## 96 1988 15 0.0051387461
## 97 1989 8 0.0027406646
## 98 1990 19 0.0065090785
## 99 1991 12 0.0041109969
## 100 1992 27 0.0092497431
## 101 1993 39 0.0133607400
## 102 1994 37 0.0126755738
## 103 1995 31 0.0106200754
## 104 1996 34 0.0116478246
## 105 1997 35 0.0119904077
## 106 1998 46 0.0157588215
## 107 1999 52 0.0178143200
## 108 2000 48 0.0164439877
## 109 2001 35 0.0119904077
## 110 2002 47 0.0161014046
## 111 2003 88 0.0301473107
## 112 2004 99 0.0339157246
## 113 2005 142 0.0486467968
## 114 2006 138 0.0472764645
## 115 2007 109 0.0373415553
## 116 2008 49 0.0167865707
## 117 2009 25 0.0085645769
## 118 2010 3 0.0010277492
##
## $YearRemodAdd
## factor_type raw_count percentage
## 1 1950 361 0.123672491
## 2 1951 14 0.004796163
## 3 1952 15 0.005138746
## 4 1953 20 0.006851662
## 5 1954 28 0.009592326
## 6 1955 25 0.008564577
## 7 1956 30 0.010277492
## 8 1957 20 0.006851662
## 9 1958 34 0.011647825
## 10 1959 30 0.010277492
## 11 1960 29 0.009934909
## 12 1961 24 0.008221994
## 13 1962 26 0.008907160
## 14 1963 30 0.010277492
## 15 1964 26 0.008907160
## 16 1965 28 0.009592326
## 17 1966 27 0.009249743
## 18 1967 34 0.011647825
## 19 1968 39 0.013360740
## 20 1969 26 0.008907160
## 21 1970 44 0.015073655
## 22 1971 31 0.010620075
## 23 1972 35 0.011990408
## 24 1973 21 0.007194245
## 25 1974 19 0.006509078
## 26 1975 30 0.010277492
## 27 1976 48 0.016443988
## 28 1977 46 0.015758822
## 29 1978 36 0.012332991
## 30 1979 24 0.008221994
## 31 1980 26 0.008907160
## 32 1981 12 0.004110997
## 33 1982 9 0.003083248
## 34 1983 11 0.003768414
## 35 1984 19 0.006509078
## 36 1985 14 0.004796163
## 37 1986 12 0.004110997
## 38 1987 16 0.005481329
## 39 1988 15 0.005138746
## 40 1989 18 0.006166495
## 41 1990 29 0.009934909
## 42 1991 29 0.009934909
## 43 1992 32 0.010962658
## 44 1993 43 0.014731072
## 45 1994 53 0.018156903
## 46 1995 56 0.019184652
## 47 1996 59 0.020212402
## 48 1997 49 0.016786571
## 49 1998 77 0.026378897
## 50 1999 60 0.020554985
## 51 2000 104 0.035628640
## 52 2001 49 0.016786571
## 53 2002 82 0.028091812
## 54 2003 99 0.033915725
## 55 2004 111 0.038026721
## 56 2005 141 0.048304214
## 57 2006 202 0.069201781
## 58 2007 164 0.056183625
## 59 2008 81 0.027749229
## 60 2009 34 0.011647825
## 61 2010 13 0.004453580
##
## $RoofStyle
## factor_type raw_count percentage
## 1 Flat 20 0.006851662
## 2 Gable 2310 0.791366906
## 3 Gambrel 22 0.007536828
## 4 Hip 551 0.188763275
## 5 Mansard 11 0.003768414
## 6 Shed 5 0.001712915
##
## $RoofMatl
## factor_type raw_count percentage
## 1 ClyTile 1 0.0003425831
## 2 CompShg 2876 0.9852689277
## 3 Membran 1 0.0003425831
## 4 Metal 1 0.0003425831
## 5 Roll 1 0.0003425831
## 6 Tar&Grv 23 0.0078794108
## 7 WdShake 9 0.0030832477
## 8 WdShngl 7 0.0023980815
##
## $Exterior1st
## factor_type raw_count percentage
## 1 AsbShng 44 0.0150788211
## 2 AsphShn 2 0.0006854010
## 3 BrkComm 6 0.0020562029
## 4 BrkFace 87 0.0298149417
## 5 CBlock 2 0.0006854010
## 6 CemntBd 126 0.0431802605
## 7 HdBoard 442 0.1514736121
## 8 ImStucc 1 0.0003427005
## 9 MetalSd 450 0.1542152159
## 10 Plywood 221 0.0757368060
## 11 Stone 2 0.0006854010
## 12 Stucco 43 0.0147361206
## 13 VinylSd 1025 0.3512679918
## 14 Wd Sdng 411 0.1408498972
## 15 WdShing 56 0.0191912269
##
## $Exterior2nd
## factor_type raw_count percentage
## 1 AsbShng 38 0.0130226182
## 2 AsphShn 4 0.0013708019
## 3 Brk Cmn 22 0.0075394106
## 4 BrkFace 47 0.0161069225
## 5 CBlock 3 0.0010281014
## 6 CmentBd 126 0.0431802605
## 7 HdBoard 406 0.1391363948
## 8 ImStucc 15 0.0051405072
## 9 MetalSd 447 0.1531871145
## 10 Other 1 0.0003427005
## 11 Plywood 270 0.0925291295
## 12 Stone 6 0.0020562029
## 13 Stucco 47 0.0161069225
## 14 VinylSd 1014 0.3474982865
## 15 Wd Sdng 391 0.1339958876
## 16 Wd Shng 81 0.0277587389
##
## $MasVnrType
## factor_type raw_count percentage
## 1 BrkCmn 25 0.008635579
## 2 BrkFace 879 0.303626943
## 3 None 1742 0.601727116
## 4 Stone 249 0.086010363
##
## $ExterQual
## factor_type raw_count percentage
## 1 Ex 107 0.03665639
## 2 Fa 35 0.01199041
## 3 Gd 979 0.33538883
## 4 TA 1798 0.61596437
##
## $ExterCond
## factor_type raw_count percentage
## 1 Ex 12 0.004110997
## 2 Fa 67 0.022953066
## 3 Gd 299 0.102432340
## 4 Po 3 0.001027749
## 5 TA 2538 0.869475848
##
## $Foundation
## factor_type raw_count percentage
## 1 BrkTil 311 0.106543337
## 2 CBlock 1235 0.423090099
## 3 PConc 1308 0.448098664
## 4 Slab 49 0.016786571
## 5 Stone 11 0.003768414
## 6 Wood 5 0.001712915
##
## $BsmtQual
## factor_type raw_count percentage
## 1 Ex 258 0.08838643
## 2 Fa 88 0.03014731
## 3 Gd 1209 0.41418294
## 4 no_basement 81 0.02774923
## 5 TA 1283 0.43953409
##
## $BsmtCond
## factor_type raw_count percentage
## 1 Fa 104 0.035628640
## 2 Gd 122 0.041795135
## 3 no_basement 82 0.028091812
## 4 Po 5 0.001712915
## 5 TA 2606 0.892771497
##
## $BsmtExposure
## factor_type raw_count percentage
## 1 Av 418 0.14319973
## 2 Gd 276 0.09455293
## 3 Mn 239 0.08187736
## 4 No 1904 0.65227818
## 5 no_basemente 82 0.02809181
##
## $BsmtFinType1
## factor_type raw_count percentage
## 1 ALQ 429 0.14696814
## 2 BLQ 269 0.09215485
## 3 GLQ 849 0.29085303
## 4 LwQ 154 0.05275779
## 5 no_basement 79 0.02706406
## 6 Rec 288 0.09866393
## 7 Unf 851 0.29153820
##
## $BsmtFinType2
## factor_type raw_count percentage
## 1 ALQ 52 0.01781432
## 2 BLQ 68 0.02329565
## 3 GLQ 34 0.01164782
## 4 LwQ 87 0.02980473
## 5 no_basement 80 0.02740665
## 6 Rec 105 0.03597122
## 7 Unf 2493 0.85405961
##
## $Heating
## factor_type raw_count percentage
## 1 Floor 1 0.0003425831
## 2 GasA 2874 0.9845837616
## 3 GasW 27 0.0092497431
## 4 Grav 9 0.0030832477
## 5 OthW 2 0.0006851662
## 6 Wall 6 0.0020554985
##
## $HeatingQC
## factor_type raw_count percentage
## 1 Ex 1493 0.511476533
## 2 Fa 92 0.031517643
## 3 Gd 474 0.162384378
## 4 Po 3 0.001027749
## 5 TA 857 0.293593696
##
## $CentralAir
## factor_type raw_count percentage
## 1 N 196 0.06714628
## 2 Y 2723 0.93285372
##
## $Electrical
## factor_type raw_count percentage
## 1 FuseA 188 0.0644276902
## 2 FuseF 50 0.0171350240
## 3 FuseP 8 0.0027416038
## 4 Mix 1 0.0003427005
## 5 SBrkr 2671 0.9153529815
##
## $KitchenQual
## factor_type raw_count percentage
## 1 Ex 205 0.07025360
## 2 Fa 70 0.02398903
## 3 Gd 1151 0.39444825
## 4 TA 1492 0.51130912
##
## $Functional
## factor_type raw_count percentage
## 1 Maj1 19 0.0065135413
## 2 Maj2 9 0.0030853617
## 3 Min1 65 0.0222831676
## 4 Min2 70 0.0239972575
## 5 Mod 35 0.0119986287
## 6 Sev 2 0.0006856359
## 7 Typ 2717 0.9314364073
##
## $FireplaceQu
## factor_type raw_count percentage
## 1 Ex 43 0.01473107
## 2 Fa 74 0.02535115
## 3 Gd 744 0.25488181
## 4 no_fireplace 1420 0.48646797
## 5 Po 46 0.01575882
## 6 TA 592 0.20280918
##
## $GarageType
## factor_type raw_count percentage
## 1 2Types 23 0.007879411
## 2 Attchd 1723 0.590270641
## 3 Basment 36 0.012332991
## 4 BuiltIn 186 0.063720452
## 5 CarPort 15 0.005138746
## 6 Detchd 779 0.266872217
## 7 no_garage 157 0.053785543
##
## $GarageYrBlt
## factor_type raw_count percentage
## 1 1895 1 0.0003425831
## 2 1896 1 0.0003425831
## 3 1900 6 0.0020554985
## 4 1906 1 0.0003425831
## 5 1908 1 0.0003425831
## 6 1910 10 0.0034258308
## 7 1914 2 0.0006851662
## 8 1915 7 0.0023980815
## 9 1916 6 0.0020554985
## 10 1917 2 0.0006851662
## 11 1918 3 0.0010277492
## 12 1919 1 0.0003425831
## 13 1920 33 0.0113052415
## 14 1921 5 0.0017129154
## 15 1922 8 0.0027406646
## 16 1923 6 0.0020554985
## 17 1924 8 0.0027406646
## 18 1925 15 0.0051387461
## 19 1926 15 0.0051387461
## 20 1927 5 0.0017129154
## 21 1928 7 0.0023980815
## 22 1929 2 0.0006851662
## 23 1930 27 0.0092497431
## 24 1931 4 0.0013703323
## 25 1932 4 0.0013703323
## 26 1933 1 0.0003425831
## 27 1934 4 0.0013703323
## 28 1935 8 0.0027406646
## 29 1936 7 0.0023980815
## 30 1937 6 0.0020554985
## 31 1938 11 0.0037684138
## 32 1939 21 0.0071942446
## 33 1940 25 0.0085645769
## 34 1941 14 0.0047961631
## 35 1942 6 0.0020554985
## 36 1943 1 0.0003425831
## 37 1945 10 0.0034258308
## 38 1946 9 0.0030832477
## 39 1947 5 0.0017129154
## 40 1948 19 0.0065090785
## 41 1949 14 0.0047961631
## 42 1950 51 0.0174717369
## 43 1951 17 0.0058239123
## 44 1952 16 0.0054813292
## 45 1953 23 0.0078794108
## 46 1954 37 0.0126755738
## 47 1955 24 0.0082219938
## 48 1956 41 0.0140459061
## 49 1957 34 0.0116478246
## 50 1958 42 0.0143884892
## 51 1959 36 0.0123329908
## 52 1960 37 0.0126755738
## 53 1961 31 0.0106200754
## 54 1962 35 0.0119904077
## 55 1963 34 0.0116478246
## 56 1964 35 0.0119904077
## 57 1965 34 0.0116478246
## 58 1966 39 0.0133607400
## 59 1967 36 0.0123329908
## 60 1968 48 0.0164439877
## 61 1969 32 0.0109626584
## 62 1970 32 0.0109626584
## 63 1971 24 0.0082219938
## 64 1972 27 0.0092497431
## 65 1973 29 0.0099349092
## 66 1974 35 0.0119904077
## 67 1975 28 0.0095923261
## 68 1976 50 0.0171291538
## 69 1977 66 0.0226104830
## 70 1978 41 0.0140459061
## 71 1979 35 0.0119904077
## 72 1980 32 0.0109626584
## 73 1981 15 0.0051387461
## 74 1982 9 0.0030832477
## 75 1983 11 0.0037684138
## 76 1984 19 0.0065090785
## 77 1985 18 0.0061664954
## 78 1986 12 0.0041109969
## 79 1987 18 0.0061664954
## 80 1988 20 0.0068516615
## 81 1989 19 0.0065090785
## 82 1990 26 0.0089071600
## 83 1991 17 0.0058239123
## 84 1992 27 0.0092497431
## 85 1993 49 0.0167865707
## 86 1994 39 0.0133607400
## 87 1995 35 0.0119904077
## 88 1996 40 0.0137033231
## 89 1997 44 0.0150736554
## 90 1998 58 0.0198698184
## 91 1999 54 0.0184994861
## 92 2000 55 0.0188420692
## 93 2001 41 0.0140459061
## 94 2002 53 0.0181569030
## 95 2003 92 0.0315176430
## 96 2004 99 0.0339157246
## 97 2005 142 0.0486467968
## 98 2006 115 0.0393970538
## 99 2007 116 0.0397396369
## 100 2008 61 0.0208975677
## 101 2009 29 0.0099349092
## 102 2010 5 0.0017129154
## 103 no_garage 159 0.0544707091
##
## $GarageFinish
## factor_type raw_count percentage
## 1 Fin 719 0.24631723
## 2 no_garage 159 0.05447071
## 3 RFn 811 0.27783487
## 4 Unf 1230 0.42137718
##
## $GarageQual
## factor_type raw_count percentage
## 1 Ex 3 0.001027749
## 2 Fa 124 0.042480301
## 3 Gd 24 0.008221994
## 4 no_garage 159 0.054470709
## 5 Po 5 0.001712915
## 6 TA 2604 0.892086331
##
## $GarageCond
## factor_type raw_count percentage
## 1 Ex 3 0.001027749
## 2 Fa 74 0.025351148
## 3 Gd 15 0.005138746
## 4 no_garage 159 0.054470709
## 5 Po 14 0.004796163
## 6 TA 2654 0.909215485
##
## $PavedDrive
## factor_type raw_count percentage
## 1 N 216 0.07399794
## 2 P 62 0.02124015
## 3 Y 2641 0.90476190
##
## $PoolQC
## factor_type raw_count percentage
## 1 Ex 4 0.0013703323
## 2 Fa 2 0.0006851662
## 3 Gd 4 0.0013703323
## 4 no_pool 2909 0.9965741692
##
## $Fence
## factor_type raw_count percentage
## 1 GdPrv 118 0.040424803
## 2 GdWo 112 0.038369305
## 3 MnPrv 329 0.112709832
## 4 MnWw 12 0.004110997
## 5 no_fence 2348 0.804385063
##
## $MiscFeature
## factor_type raw_count percentage
## 1 Gar2 5 0.0017129154
## 2 none 2814 0.9640287770
## 3 Othr 4 0.0013703323
## 4 Shed 95 0.0325453923
## 5 TenC 1 0.0003425831
##
## $MoSold
## factor_type raw_count percentage
## 1 1 122 0.04179514
## 2 10 173 0.05926687
## 3 11 142 0.04864680
## 4 12 104 0.03562864
## 5 2 133 0.04556355
## 6 3 232 0.07947927
## 7 4 279 0.09558068
## 8 5 394 0.13497773
## 9 6 503 0.17231929
## 10 7 446 0.15279205
## 11 8 233 0.07982186
## 12 9 158 0.05412813
##
## $YrSold
## factor_type raw_count percentage
## 1 2006 619 0.2120589
## 2 2007 692 0.2370675
## 3 2008 622 0.2130867
## 4 2009 647 0.2216513
## 5 2010 339 0.1161357
##
## $SaleType
## factor_type raw_count percentage
## 1 COD 87 0.029814942
## 2 Con 5 0.001713502
## 3 ConLD 26 0.008910212
## 4 ConLI 9 0.003084304
## 5 ConLw 8 0.002741604
## 6 CWD 12 0.004112406
## 7 New 239 0.081905415
## 8 Oth 7 0.002398903
## 9 WD 2525 0.865318711
##
## $SaleCondition
## factor_type raw_count percentage
## 1 Abnorml 190 0.065090785
## 2 AdjLand 12 0.004110997
## 3 Alloca 24 0.008221994
## 4 Family 46 0.015758822
## 5 Normal 2402 0.822884550
## 6 Partial 245 0.083932854
##
## $dataset
## factor_type raw_count percentage
## 1 test 1459 0.4998287
## 2 train 1460 0.5001713
# Quem possue fatores que representam acima de 90%
lista_90 <- lapply(list_of_var_counts,
function(x){
sub_list <- sum(x$percentage > 0.90)
}
)
var_fac_not_over <- which(unlist(lista_90) == 0)
summary(joined_data_factors[ ,var_fac_not_over])
2.Quem possue NA’s?
# Pega index de colunas com NA
index_na <- sapply(joined_data, function(x) which(is.na(x)))
len_na_index <- lapply(index_na, length)
index_coluns_with_na <- as.numeric(which(unlist(len_na_index) > 0))
# Imprime nome das colunas
colnames(joined_data)[index_coluns_with_na]
## [1] "MSZoning" "LotFrontage" "Exterior1st" "Exterior2nd" "MasVnrType"
## [6] "MasVnrArea" "Electrical" "KitchenQual" "Functional" "SaleType"
## [11] "SalePrice"
# Imprime tipo de variavel das colunas com NA
sapply(joined_data[ ,index_coluns_with_na], is.factor)
## MSZoning LotFrontage Exterior1st Exterior2nd MasVnrType MasVnrArea
## TRUE FALSE TRUE TRUE TRUE FALSE
## Electrical KitchenQual Functional SaleType SalePrice
## TRUE TRUE TRUE TRUE FALSE
sapply(joined_data[ ,index_coluns_with_na], is.numeric)
## MSZoning LotFrontage Exterior1st Exterior2nd MasVnrType MasVnrArea
## FALSE TRUE FALSE FALSE FALSE TRUE
## Electrical KitchenQual Functional SaleType SalePrice
## FALSE FALSE FALSE FALSE TRUE
summary(joined_data[ ,index_coluns_with_na])
## MSZoning LotFrontage Exterior1st Exterior2nd
## C (all): 25 Min. : 21.00 VinylSd:1025 VinylSd:1014
## FV : 139 1st Qu.: 59.00 MetalSd: 450 MetalSd: 447
## RH : 26 Median : 68.00 HdBoard: 442 HdBoard: 406
## RL :2265 Mean : 69.31 Wd Sdng: 411 Wd Sdng: 391
## RM : 460 3rd Qu.: 80.00 Plywood: 221 Plywood: 270
## NA's : 4 Max. :313.00 (Other): 369 (Other): 390
## NA's :486 NA's : 1 NA's : 1
## MasVnrType MasVnrArea Electrical KitchenQual Functional
## BrkCmn : 25 Min. : 0.0 FuseA: 188 Ex : 205 Typ :2717
## BrkFace: 879 1st Qu.: 0.0 FuseF: 50 Fa : 70 Min2 : 70
## None :1742 Median : 0.0 FuseP: 8 Gd :1151 Min1 : 65
## Stone : 249 Mean : 102.2 Mix : 1 TA :1492 Mod : 35
## NA's : 24 3rd Qu.: 164.0 SBrkr:2671 NA's: 1 Maj1 : 19
## Max. :1600.0 NA's : 1 (Other): 11
## NA's :23 NA's : 2
## SaleType SalePrice
## WD :2525 Min. : 34900
## New : 239 1st Qu.:129975
## COD : 87 Median :163000
## ConLD : 26 Mean :180921
## CWD : 12 3rd Qu.:214000
## (Other): 29 Max. :755000
## NA's : 1 NA's :1459
3.Estimando pseudo valores (numéricos)
Nessa primeira tentativa, não descartaremos nenhuma linha que contém NA’s. Tentaremos aproveitar todas as informações disponíveis estimando pseudo valores para preencher as lacunas faltantes em cada variável. No caso de variáveis numéricas, iremos aplicar modelos de regressão para estimar o melhor valor possíve, em vez de simplesmente preencher com um valor arbitrário como a média. Optamos por usar o algoritmo de florestas aleatórias por lidar bem tanto com variáveis categóricas como numéricas e também por ser útil em problemas de regressão, não se limitando apenas a problemas de classificação.
# Comeca com as variaveis com menos NA
# Pega colunas numericas com NA
numeric_cols_na <- names(which(sapply(joined_data[ ,index_coluns_with_na], is.numeric)))
summary(joined_data[ ,numeric_cols_na])
## LotFrontage MasVnrArea SalePrice
## Min. : 21.00 Min. : 0.0 Min. : 34900
## 1st Qu.: 59.00 1st Qu.: 0.0 1st Qu.:129975
## Median : 68.00 Median : 0.0 Median :163000
## Mean : 69.31 Mean : 102.2 Mean :180921
## 3rd Qu.: 80.00 3rd Qu.: 164.0 3rd Qu.:214000
## Max. :313.00 Max. :1600.0 Max. :755000
## NA's :486 NA's :23 NA's :1459
3.1.Verificando possíveis correlações entre variáveis
correlations <- cor(na.omit(joined_data[ , col_numeric[c(-32)] ])) # Tirando lotFrontage, pois tem muitos NA
corrplot(correlations, method = c('square'))
3.2.Variável ‘MasVnrArea’
# Esse tipo de detalhe nao eh caracteristico de casas antigas. sim ou nao?
joined_data %>% ggplot(aes(x=YearBuilt, y=MasVnrArea, col=MasVnrType)) +
geom_point(size=2, alpha=0.8) +
theme_bw()
random_forest_model <- train(MasVnrArea ~ MSSubClass + LotArea + LotShape + LandContour + Neighborhood + Condition1 + BldgType + HouseStyle + OverallQual + OverallCond + YearBuilt + Foundation + BsmtQual + BsmtFinType1 + GrLivArea + FireplaceQu + GarageType ,
data = na.omit(joined_data[ ,c(-1,-80,-81)]),
tuneGrid= data.frame( mtry=c(5,8,10,15,20) ),
method= "rf",
trControl= trainControl(method='cv', number=5, verboseIter = TRUE))
print(random_forest_model)
plot(random_forest_model)
prediction <- predict(random_forest_model, joined_data)
joined_data$MasVnrArea[ which(is.na(joined_data$MasVnrArea)) ] <- as.integer(prediction[which(is.na(joined_data$MasVnrArea))])
3.3.Variável ‘LotFrontage’
lot_var <- joined_data %>% select(contains('Lot'))
lot_var %>% ggplot(aes(x=LotFrontage, y=LotArea, col=LotShape)) +
geom_point(size=3, alpha=0.6) +
facet_grid(~ LotShape) +
scale_y_log10() +
theme_bw()
joined_data %>%
select(contains('Lot')) %>%
filter(is.na(LotFrontage)) %>%
group_by(LotShape) %>%
summarize(mean_area = mean(LotArea), lotShape_counts = n())
random_forest_model <- train(LotFrontage ~ LotArea + LotShape + LotConfig + MSSubClass + LandContour + Neighborhood + Condition1
+ BldgType + HouseStyle + OverallQual + OverallCond +
YearBuilt + Foundation + BsmtQual + BsmtFinType1 + GrLivArea + FireplaceQu + GarageType,
data = na.omit(joined_data[ ,c(-1,-80,-81)]),
tuneGrid= data.frame( mtry=c(5,8,10,12,14,18) ),
method= "rf",
trControl= trainControl(method='cv', number=5, verboseIter = TRUE)
)
print(random_forest_model)
plot(random_forest_model)
prediction <- predict( random_forest_model, joined_data)
data.frame(original= joined_data$LotFrontage, prediction)
joined_data$LotFrontage[ which(is.na(joined_data$LotFrontage)) ] <- as.integer(prediction[which(is.na(joined_data$LotFrontage))])
4.Estimando pseudo valores (categóricos)
4.1.Variável ‘MSZoning’
random_forest_model <- train(MSZoning ~ LotFrontage + LotArea + LotShape + LotConfig + MSSubClass +
LandContour + Neighborhood + Condition1 +
BldgType + HouseStyle + OverallQual + OverallCond +
YearBuilt + Foundation + BsmtQual + BsmtFinType1 +
GrLivArea + FireplaceQu + GarageType,
data = na.omit(joined_data[ ,c(-1,-80,-81)]),
tuneGrid= data.frame( mtry=c(5,8,10,12,14,18) ),
method= "rf",
trControl= trainControl(method='cv', number=5, verboseIter = TRUE)
)
print(random_forest_model)
plot(random_forest_model)
prediction <- predict( random_forest_model, joined_data)
data.frame(original= joined_data$MSZoning, prediction)
joined_data$MSZoning[ which(is.na(joined_data$MSZoning)) ] <- prediction[which(is.na(joined_data$MSZoning))]
4.2.Variável ‘Exterior1st’
# Pegadinha $#%$!, tirar garage year built pq tem NA nas unicas classes
random_forest_model <- train(Exterior1st ~ LotFrontage + LotArea + LotShape + LotConfig + MSSubClass +
LandContour + Neighborhood + Condition1 +
BldgType + HouseStyle + OverallQual + OverallCond +
YearBuilt + Foundation + BsmtQual + BsmtFinType1 +
GrLivArea + FireplaceQu + GarageType,
data = na.omit(joined_data[ ,c(-1,-59,-80,-81)]),
tuneGrid= data.frame( mtry=c(5,8,10,12,14,18) ),
method= "rf",
trControl= trainControl(method='cv', number=5, verboseIter = TRUE)
)
print(random_forest_model)
plot(random_forest_model)
prediction <- predict( random_forest_model, joined_data)
View(data.frame(original= joined_data$Exterior1st, prediction))
joined_data$Exterior1st[ which(is.na(joined_data$Exterior1st)) ] <- prediction[which(is.na(joined_data$Exterior1st))]
4.3.Variável ‘Exterior2nd’
random_forest_model <- train(Exterior2nd ~ LotFrontage + LotArea + LotShape + LotConfig + MSSubClass +
LandContour + Neighborhood + Condition1 +
BldgType + HouseStyle + OverallQual + OverallCond +
YearBuilt + Foundation + BsmtQual + BsmtFinType1 +
GrLivArea + FireplaceQu + GarageType,
data = na.omit(joined_data[ ,c(-1,-59,-80,-81)]),
tuneGrid= data.frame( mtry=c(5,8,10,12,14,18, 20) ),
method= "rf",
trControl= trainControl(method='cv', number=5, verboseIter = TRUE)
)
print(random_forest_model)
plot(random_forest_model)
prediction <- predict( random_forest_model, joined_data)
joined_data$Exterior2nd[ which(is.na(joined_data$Exterior2nd)) ] <- prediction[which(is.na(joined_data$Exterior2nd))]
4.4.Variável ‘MasVnrType’
random_forest_model <- train(MasVnrType ~ LotFrontage + LotArea + LotShape + LotConfig + MSSubClass +
LandContour + Neighborhood + Condition1 +
BldgType + HouseStyle + OverallQual + OverallCond +
YearBuilt + Foundation + BsmtQual + BsmtFinType1 +
GrLivArea + FireplaceQu + GarageType,
data = na.omit(joined_data[ ,c(-1,-80,-81)]),
tuneGrid= data.frame( mtry=c(5,10,15,28) ),
method= "rf",
trControl= trainControl(method='cv', number=5, verboseIter = TRUE)
)
print(random_forest_model)
plot(random_forest_model)
summary(random_forest_model)
prediction <- predict( random_forest_model, joined_data)
joined_data$MasVnrType[ which(is.na(joined_data$MasVnrType)) ] <- prediction[which(is.na(joined_data$MasVnrType))]
4.5.Variável ‘Electrical’
random_forest_model <- train(Electrical ~ MasVnrType + LotFrontage + LotArea + LotShape + LotConfig + MSSubClass +
LandContour + Neighborhood + Condition1 +
BldgType + HouseStyle + OverallQual + OverallCond +
YearBuilt + Foundation + BsmtQual + BsmtFinType1 +
GrLivArea + GarageType,
data = na.omit(joined_data[ ,c(-1,-80,-81)]),
tuneGrid= data.frame( mtry=c(5,10,15,19) ),
method= "rf",
trControl= trainControl(method='cv', number=5, verboseIter = TRUE)
)
print(random_forest_model)
plot(random_forest_model)
summary(random_forest_model)
prediction <- predict( random_forest_model, joined_data)
joined_data$Electrical[ which(is.na(joined_data$Electrical)) ] <- prediction[which(is.na(joined_data$Electrical))]
4.6.Variável ‘SaleType’
random_forest_model <- train(SaleType ~ MasVnrType + LotFrontage + LotArea + LotShape + LotConfig + MSSubClass +
LandContour + Neighborhood + Condition1 +
BldgType + HouseStyle + OverallQual + OverallCond +
YearBuilt + Foundation + BsmtQual + BsmtFinType1 +
GrLivArea + GarageType,
data = na.omit(joined_data[ ,c(-1,-80,-81)]),
tuneGrid= data.frame( mtry=c(5,10,15,19) ),
method= "rf",
trControl= trainControl(method='cv', number=5, verboseIter = TRUE)
)
print(random_forest_model)
plot(random_forest_model)
prediction <- predict( random_forest_model, joined_data)
joined_data$SaleType[ which(is.na(joined_data$SaleType)) ] <- prediction[which(is.na(joined_data$SaleType))]
4.7.Variável ‘Functional’
random_forest_model <- train(Functional ~ SaleType + MasVnrType + LotFrontage + LotArea + LotShape + LotConfig + MSSubClass +
LandContour + Neighborhood + Condition1 +
BldgType + HouseStyle + OverallQual + OverallCond +
YearBuilt + Foundation + BsmtQual + BsmtFinType1 +
GrLivArea + GarageType,
data = na.omit(joined_data[ ,c(-1,-80,-81)]),
tuneGrid= data.frame( mtry=c(5,10,15,19) ),
method= "rf",
trControl= trainControl(method='cv', number=5, verboseIter = TRUE)
)
print(random_forest_model)
plot(random_forest_model)
summary(random_forest_model)
prediction <- predict( random_forest_model, joined_data)
joined_data$Functional[ which(is.na(joined_data$Functional)) ] <- prediction[which(is.na(joined_data$Functional))]
4.8.Variável ‘KitchenQual’
random_forest_model <- train(KitchenQual ~ SaleType + MasVnrType + LotFrontage + LotArea + LotShape + LotConfig + MSSubClass +
LandContour + Neighborhood + Condition1 +
BldgType + HouseStyle + OverallQual + OverallCond +
YearBuilt + Foundation + BsmtQual + BsmtFinType1 +
GrLivArea + GarageType,
data = na.omit(joined_data[ ,c(-1,-80,-81)]),
tuneGrid= data.frame( mtry=c(5,10,15,19) ),
method= "rf",
trControl= trainControl(method='cv', number=5, verboseIter = TRUE)
)
print(random_forest_model)
plot(random_forest_model)
prediction <- predict( random_forest_model, joined_data)
joined_data$KitchenQual[ which(is.na(joined_data$KitchenQual)) ] <- prediction[which(is.na(joined_data$KitchenQual))]
write.table(file='joined_data_cleaned_without_NA.txt', joined_data, sep = '\t', row.names = F)
5.Explorando os dados
5.1.Carrega os dados salvos
joined_data <- read.table(file='/home/davi/Desktop/house_prices-_advanced_regression_techniques/joined_data_cleaned_without_NA.txt', sep = '\t', header = TRUE)
joined_data$GarageYrBlt <- as.factor(joined_data$GarageYrBlt)
joined_data$Alley[which(is.na(joined_data$Alley))] <- 'no_alley_access'
joined_data$BsmtQual[which(is.na(joined_data$BsmtQual))] <- 'no_basement'
joined_data$BsmtCond[which(is.na(joined_data$BsmtCond))] <- 'no_basement'
joined_data$BsmtFinType1[which(is.na(joined_data$BsmtFinType1))] <- 'no_basement'
joined_data$BsmtFinType2[which(is.na(joined_data$BsmtFinType2))] <- 'no_basement'
joined_data$PoolQC[which(is.na(joined_data$PoolQC))] <- 'no_pool'
joined_data$Fence[which(is.na(joined_data$Fence))] <- 'no_fence'
joined_data$MiscFeature[which(is.na(joined_data$MiscFeature))] <- 'none'
joined_data$BsmtExposure[which(is.na(joined_data$BsmtExposure))] <- 'no_basemente'
joined_data$FireplaceQu[which(is.na(joined_data$FireplaceQu))] <- 'no_fireplace'
joined_data$GarageType[which(is.na(joined_data$GarageType))] <- 'no_garage'
joined_data$GarageFinish[which(is.na(joined_data$GarageFinish))] <- 'no_garage'
joined_data$GarageQual[which(is.na(joined_data$GarageQual))] <- 'no_garage'
joined_data$GarageCond[which(is.na(joined_data$GarageCond))] <- 'no_garage'
# Definindo e convertendo as variaveis categoricas (factors)
joined_data$MSSubClass <- as.factor(joined_data$MSSubClass)
joined_data$OverallQual <- as.factor(joined_data$OverallQual)
joined_data$OverallCond <- as.factor(joined_data$OverallCond)
joined_data$GarageYrBlt <- as.factor(joined_data$GarageYrBlt)
joined_data$YrSold <- as.factor(joined_data$YrSold)
joined_data$MoSold <- as.factor(joined_data$MoSold)
joined_data$GarageYrBlt <- as.factor(joined_data$GarageYrBlt)
joined_data$YearBuilt <- as.factor(joined_data$YearBuilt)
joined_data$YearRemodAdd <- as.factor(joined_data$YearRemodAdd)
joined_data$GarageCars <- as.factor(joined_data$GarageCars)
joined_data <- joined_data %>% mutate(age_until_sale = as.numeric(as.character(YrSold)) - as.numeric(as.character(YearBuilt)))
# Criar nova variavel: area_habitada_casa
# TotalBsmtSF: Total square feet of basement area
# GrLivArea: Above grade (ground) living area square feet
# GarageArea: Size of garage in square feet
# WoodDeckSF: Wood deck area in square feet
# OpenPorchSF: Open porch area in square feet
# EnclosedPorch: Enclosed porch area in square feet
# 3SsnPorch: Three season porch area in square feet
# ScreenPorch: Screen porch area in square feet
# PoolArea: Pool area in square feet
joined_data <- joined_data %>% mutate(house_useful_area = GrLivArea + GarageArea +
WoodDeckSF + OpenPorchSF + PoolArea)
Separando finalmente o cojunto de treino e teste
train <- joined_data %>%
filter(dataset == 'train')
test <- joined_data %>%
filter(dataset == 'test')
Explorando a relacção entre algumas variáveis de interesse para poderem ser usadas na contrução dos modelos
Casas mais novas parecem valer mais no geral? Muito diferente do que esperávamos.
train %>%
ggplot(aes(x= age_until_sale, y= SalePrice, col=Neighborhood)) +
geom_point(size=4, alpha=0.4)+
theme_bw()
Basement qual tem muita influência no preço da casa.
train %>%
ggplot(aes(x= BsmtQual, y= SalePrice)) +
geom_boxplot()+
theme_bw()
GrLivArea vs SalePrice tem uma relação linear aparente
“Above Grade Living Area Total Sq. Ft. represents the finished above grade living area in a house. It does not include unfinished areas, the area occupied by a cathedral ceiling, enclosed non-living areas such as garages and enclosed porches, or basement area or finished basement living space”.
train %>%
ggplot(aes(x= GrLivArea, y= SalePrice)) +
geom_point(size=4, alpha=0.4)+
theme_bw()
# Fazendo uma regressão com e sem o outlier
summary(lm(SalePrice ~ GrLivArea, data=train))
##
## Call:
## lm(formula = SalePrice ~ GrLivArea, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -462999 -29800 -1124 21957 339832
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18569.026 4480.755 4.144 3.61e-05 ***
## GrLivArea 107.130 2.794 38.348 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 56070 on 1458 degrees of freedom
## Multiple R-squared: 0.5021, Adjusted R-squared: 0.5018
## F-statistic: 1471 on 1 and 1458 DF, p-value: < 2.2e-16
summary(lm(SalePrice ~ GrLivArea, data=train[-which(train$GrLivArea > 4500), ]))
##
## Call:
## lm(formula = SalePrice ~ GrLivArea, data = train[-which(train$GrLivArea >
## 4500), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -197730 -29815 -337 23239 332534
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7168.970 4432.501 1.617 0.106
## GrLivArea 115.040 2.782 41.358 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 53920 on 1456 degrees of freedom
## Multiple R-squared: 0.5402, Adjusted R-squared: 0.5399
## F-statistic: 1710 on 1 and 1456 DF, p-value: < 2.2e-16
Interessante: Parece que a area do terreno não influencia muito no preço da casa.
train %>%
ggplot(aes(x= LotArea, y= SalePrice, col= Neighborhood)) +
geom_point(size=3, alpha=0.4)+
theme_bw()
Casas com piscina, sendo elas de boa qualidade, valem muito mais.
train %>%
ggplot(aes(x= PoolQC, y= SalePrice)) +
geom_boxplot()+
theme_bw()
Qualidade da casa vs preço, ótima relação.
train %>%
ggplot(aes(x= OverallQual, y= SalePrice)) +
geom_boxplot()+
theme_bw()
Somei todas as “áreas úteis da casa” para testar se a relação melhora. Ao ver, sim!
train %>%
ggplot(aes(x= house_useful_area, y= SalePrice, col=OverallQual)) +
geom_point(size=3, alpha=0.7)+
theme_bw() +
scale_color_brewer(palette='Spectral')
6. Treinando e validando o modelo para o conjunto de teste
6.1. Random forest
random_forest_model <- train(SalePrice~ .,
data = train,
tuneGrid= data.frame(mtry=c(30, 40, 50, 55, 60,65, 70,75,79,80,82)),
method= "rf",
trControl= trainControl(method='cv', number=50, verboseIter = TRUE ))
plot(random_forest_model)
print(random_forest_model)
prediction <- predict(random_forest_model, test)
to_submit <- data.frame(Id= test$Id , SalePrice = prediction)
write.csv(file='prediction_results.csv', to_submit, row.names = F )
6.2.Modelo linear
Testando modelo linear sem os ‘outliers’
lm_model <- lm(SalePrice ~ house_useful_area + OverallQual + GrLivArea +
TotalBsmtSF + GarageArea +
ExterQual + X1stFlrSF + BsmtQual +
KitchenQual + FullBath +TotRmsAbvGrd, data=train[c(-524, -1299), ])
summary(lm_model)
summary(train)
prediction <- predict(lm_model, test)
to_submit <- data.frame(Id= test$Id , SalePrice = prediction)
write.csv(file='prediction_results.csv', to_submit, row.names = F )
summary(to_submit)
6.3.SVM (Support Machine Vector)
svm_model <- train(SalePrice ~ OverallQual + house_useful_area + age_until_sale + PoolQC + Neighborhood,
data = train[c(-524, -1299), ],
method= "svmPoly",
trControl= trainControl(method='cv', number=3, verboseIter = TRUE ))
plot(svm_model)
print(svm_model)
prediction <- predict(svm_model, test)
to_submit <- data.frame(Id= test$Id , SalePrice = prediction)
write.csv(file='prediction_results.csv', to_submit, row.names = F )
prediction <- predict(svm_model, train)
6.4.GBM (Gradient Boosting Machine)
metric <- "RMSE"
trainControl <- trainControl(method="cv", number=30)
caretGrid <- expand.grid(interaction.depth= c(1,4,5,6), n.trees= c(2500,3000, 3200, 3400, 3600) ,
shrinkage= c(0.1,0.01,0.02,0.03),
n.minobsinnode=10)
set.seed(99)
gbm.caret <- train(SalePrice ~ .
, data=train
, distribution="gaussian"
, method="gbm"
, trControl=trainControl
, verbose=FALSE
, tuneGrid=caretGrid
, metric=metric
, bag.fraction=0.75
)
print(gbm.caret)
plot(gbm.caret)
prediction <- predict(gbm.caret, test)
to_submit <- data.frame(Id= test$Id , SalePrice = prediction)
write.csv(file='prediction_results.csv', to_submit, row.names = F )
7.1.Resultados obtidos
Ainda há muito a trabalhar nestes dados. O melhor resultado foi obtido com o algoritmo GBM. No Kaggle, conseguimos alcançar uma posição mediana em relação aos demais competidores. Nossa melhor pontuação foi de 0.13477
Posição no ranking geral