# Install any required package that is not available yet.
# Checking first avoids re-downloading (and needing network access for) every
# package each time the script runs.
required_packages <- c(
  "data.table", "dplyr", "ggplot2", "caret", "xgboost", "e1071", "cowplot"
)
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0L) {
  install.packages(missing_packages)
}
# Attach the packages this script relies on (order kept as in the original,
# since attach order decides which package masks which).
library(data.table) # `:=` updates, rbind/cbind on tables
library(dplyr)
library(ggplot2)
library(caret)      # dummyVars(), preProcess()
library(xgboost)    # xgb.DMatrix(), xgb.cv(), xgb.train()
library(e1071)      # skewness()
library(cowplot)
# Give `test` an all-NA Item_Outlet_Sales column (created, or overwritten if
# it already exists) so it can be stacked underneath `train`.
test[, Item_Outlet_Sales := NA]

# Stack train on top of test; the preprocessing below is applied to both
# sets at once, and the rows are split apart again by position afterwards.
combi <- rbind(train, test)
# Impute missing Item_Weight values with the mean weight recorded for the
# same Item_Identifier.
#
# The previous row-by-row loop rescanned the whole table for every missing
# value; here each item's mean is computed once per group. Items whose
# weights are all missing end up as NaN, exactly as before.
missing_index <- which(is.na(combi$Item_Weight))
if (length(missing_index) > 0L) {
  combi[, Item_Weight := ave(
    Item_Weight, Item_Identifier,
    FUN = function(w) replace(w, is.na(w), mean(w, na.rm = TRUE))
  )]
}
# Treat an Item_Visibility of exactly 0 as missing and replace it with the
# mean of the same item's non-zero visibilities.
#
# The previous loop averaged over all rows of the item, zeros included, and
# overwrote them one at a time. The imputed values were therefore biased
# towards zero and depended on row order. Items without any non-zero
# observation are left at 0 (the old code also left them at 0).
zero_index <- which(combi$Item_Visibility == 0)
if (length(zero_index) > 0L) {
  combi[, Item_Visibility := ave(
    Item_Visibility, Item_Identifier,
    FUN = function(v) {
      is_zero <- !is.na(v) & v == 0
      observed <- v[!is.na(v) & v != 0]
      if (length(observed) > 0L) {
        v[is_zero] <- mean(observed)
      }
      v
    }
  )]
}
# Map a column to the codes 0/1/2: `level_0` -> 0, `level_1` -> 1, and every
# other non-missing value -> 2. Missing values stay NA.
encode_three_levels <- function(x, level_0, level_1) {
  ifelse(x == level_0, 0, ifelse(x == level_1, 1, 2))
}

# Numeric encodings of the two outlet descriptors.
combi[, Outlet_Size_num := encode_three_levels(
  Outlet_Size, "Small", "Medium"
)]
combi[, Outlet_Location_Type_num := encode_three_levels(
  Outlet_Location_Type, "Tier 3", "Tier 2"
)]

# The original text columns are no longer needed once encoded.
combi[, c("Outlet_Size", "Outlet_Location_Type") := NULL]
# One-hot encode every column except the identifier, establishment year and
# item type. Those three are excluded from the encoder input; only
# Item_Identifier is carried over into the rebuilt table, so the other two
# are dropped from `combi` entirely.
ohe_input <- combi[
  , -c("Item_Identifier", "Outlet_Establishment_Year", "Item_Type")
]

# Fit the encoder (fullRank = TRUE) and apply it to the same columns.
ohe_1 <- dummyVars("~.", data = ohe_input, fullRank = TRUE)
ohe_df <- data.table(predict(ohe_1, ohe_input))

# Rebuild combi as: identifier + encoded columns.
combi <- cbind(combi[, "Item_Identifier"], ohe_df)
# Inspect the skewness of Item_Visibility before transforming it.
skewness(combi$Item_Visibility)

# `price_per_unit_wt` is not created anywhere in the visible part of this
# script, so `combi$price_per_unit_wt` may be NULL. Only report its skewness
# when the column actually exists.
if ("price_per_unit_wt" %in% names(combi)) {
  skewness(combi$price_per_unit_wt)
}

# Log-transform visibility. log1p(x) is log(1 + x), computed accurately for
# the small values this column holds.
combi[, Item_Visibility := log1p(Item_Visibility)]
# Centre and scale every numeric column except the target.
#
# vapply() is used instead of sapply() so the result is always a logical
# vector, whatever the input looks like.
num_vars <- which(vapply(combi, is.numeric, logical(1)))
num_vars_names <- names(num_vars)

# Columns to normalise: all numeric ones apart from Item_Outlet_Sales.
scale_cols <- setdiff(num_vars_names, "Item_Outlet_Sales")
combi_numeric <- combi[, scale_cols, with = FALSE]

# Fit the centring/scaling on the combined (train + test) rows and apply it.
prep_num <- preProcess(combi_numeric, method = c("center", "scale"))
combi_numeric_norm <- predict(prep_num, combi_numeric)

# Swap the raw numeric columns for their normalised versions. The remaining
# columns (identifier, target) come first, followed by the scaled features.
combi[, (scale_cols) := NULL]
combi <- cbind(combi, combi_numeric_norm)
# Split the combined table back into train and test by row position: the
# first nrow(train) rows are the training set, the rest the test set.
#
# seq_len()/seq.int(length.out = ) are used instead of `a:b`, which counts
# backwards (and so selects wrong rows) when either set is empty.
n_train <- nrow(train)
n_test <- nrow(combi) - n_train
train <- combi[seq_len(n_train)]
test <- combi[seq.int(n_train + 1L, length.out = n_test)]

# The target was only a placeholder on the test rows; remove it again.
test[, Item_Outlet_Sales := NULL]
# Booster parameters shared by cross-validation and final training.
# "reg:squarederror" is the current name of the squared-error regression
# objective; the old alias "reg:linear" is deprecated in xgboost.
param_list <- list(
  objective = "reg:squarederror",
  eta = 0.01,
  gamma = 1,
  max_depth = 6,
  subsample = 0.8,
  colsample_bytree = 0.5
)
# Training matrix: all columns except the identifier and the target, with
# the target supplied as the label.
train_features <- as.matrix(
  train[, -c("Item_Identifier", "Item_Outlet_Sales")]
)
Dtrain <- xgb.DMatrix(data = train_features, label = train$Item_Outlet_Sales)

# Test matrix: all columns except the identifier; no label.
test_features <- as.matrix(test[, -c("Item_Identifier")])
Dtest <- xgb.DMatrix(data = test_features)
# Fix the RNG seed before cross-validation.
set.seed(112)

# 5-fold cross-validation for up to 1000 rounds, logging every 10th round
# and stopping once the metric has not improved (lower is better) for 30
# consecutive rounds.
xgbcv <- xgb.cv(
  params = param_list,
  data = Dtrain,
  nrounds = 1000,
  nfold = 5,
  print_every_n = 10,
  early_stopping_rounds = 30,
  maximize = FALSE
)
# Train the final model for the number of rounds cross-validation selected,
# rather than a number hard-coded from one earlier run. Fall back to the
# previous constant (428) if the CV result carries no best iteration.
best_nrounds <- xgbcv$best_iteration
if (is.null(best_nrounds)) {
  best_nrounds <- 428
}

xgb_model <- xgb.train(
  data = Dtrain,
  params = param_list,
  nrounds = best_nrounds
)

# Print a summary of the fitted booster.
xgb_model