library
(data.table)
library
(dplyr)
library
(ggplot2)
library
(caret)
library
(e1071)
library
(corrplot)
library
(xgboost)
library
(cowplot)
# Load data ----
# Both CSV files are read relative to the current working directory.
train <- fread("Train_UWu5bXk.csv")
test <- fread("Test_u94Q5KV.csv")

# Show the structure of the training table.
str(train)

# Add an all-NA Item_Outlet_Sales column to the test table, then stack train
# on top of test so the steps below are applied to both sets at once.
test[, Item_Outlet_Sales := NA]
combi <- rbind(train, test)
# Impute missing values ----
# Row positions of the values that are imputed below (recorded before the
# columns are modified).
missing_index <- which(is.na(combi$Item_Weight))
zero_index <- which(combi$Item_Visibility == 0)

# Fill each missing Item_Weight with the mean weight of the rows that share
# its Item_Identifier. One grouped update replaces the row-by-row loop, which
# rescanned the whole table for every missing value. An item whose weights are
# all missing gets NaN, as with the loop.
combi[, Item_Weight := {
  weight <- Item_Weight
  weight[is.na(weight)] <- mean(weight, na.rm = TRUE)
  weight
}, by = Item_Identifier]

# Treat a visibility of exactly 0 as a placeholder and replace it with the
# mean of the item's non-zero visibilities. The zeros themselves are excluded
# from the mean; averaging over them (and over values imputed earlier in a
# loop) biased the result towards 0 and made it depend on row order. If an
# item has no non-zero visibility, its zeros are left unchanged.
combi[, Item_Visibility := {
  visibility <- Item_Visibility
  is_zero <- !is.na(visibility) & visibility == 0
  observed <- visibility[!is_zero]
  if (any(is_zero) && any(!is.na(observed))) {
    visibility[is_zero] <- mean(observed, na.rm = TRUE)
  }
  visibility
}, by = Item_Identifier]
# Item type grouping ----
# Item_Type values that are labelled "perishable".
perishable <- c(
  "Breads", "Breakfast", "Dairy", "Fruits and Vegetables", "Meat", "Seafood"
)

# Item_Type values that are labelled "non_perishable".
non_perishable <- c(
  "Baking Goods", "Canned", "Frozen Foods", "Hard Drinks",
  "Health and Hygiene", "Household", "Soft Drinks"
)

# Start every row at "not_sure", then overwrite the rows whose Item_Type is in
# one of the two lists. "perishable" is assigned last so that it takes
# precedence, matching a check of the perishable list first.
combi[, Item_Type_new := "not_sure"]
combi[Item_Type %in% non_perishable, Item_Type_new := "non_perishable"]
combi[Item_Type %in% perishable, Item_Type_new := "perishable"]
# Derived features ----
# Item_category is the first two characters of Item_Identifier.
combi[, Item_category := substr(Item_Identifier, 1, 2)]

# Rows in category "NC" get the fat content label "Non-Edible"
# (presumably "NC" marks non-consumable items; verify against the data).
combi[Item_category == "NC", Item_Fat_Content := "Non-Edible"]

# Outlet age in years, measured against the fixed reference year 2013.
combi[, Outlet_Years := 2013 - Outlet_Establishment_Year]

# The establishment year itself is kept as a factor from here on.
combi[, Outlet_Establishment_Year := as.factor(Outlet_Establishment_Year)]

# Price (Item_MRP) per unit of Item_Weight.
combi[, price_per_unit_wt := Item_MRP / Item_Weight]
# Ordinal encoding ----
# Map the two outlet text columns to numeric codes in a single update:
#   Outlet_Size:          "Small"  -> 0, "Medium" -> 1, anything else -> 2
#   Outlet_Location_Type: "Tier 3" -> 0, "Tier 2" -> 1, anything else -> 2
# NOTE(review): any other value, including an empty string, falls into
# code 2; confirm this is intended for rows without a recorded Outlet_Size.
combi[, `:=`(
  Outlet_Size_num = ifelse(
    Outlet_Size == "Small", 0,
    ifelse(Outlet_Size == "Medium", 1, 2)
  ),
  Outlet_Location_Type_num = ifelse(
    Outlet_Location_Type == "Tier 3", 0,
    ifelse(Outlet_Location_Type == "Tier 2", 1, 2)
  )
)]

# The original text columns are no longer needed once the codes exist.
combi[, `:=`(Outlet_Size = NULL, Outlet_Location_Type = NULL)]
# One-hot encoding ----
# Columns left out of the encoder input.
ohe_excluded <- c("Item_Identifier", "Outlet_Establishment_Year", "Item_Type")

# Encoder input: every column of combi except the excluded ones. Built once
# and used both for fitting the encoder and for applying it.
ohe_input <- combi[, !ohe_excluded, with = FALSE]

# Fit the dummy-variable encoder on all remaining columns (full-rank coding)
# and apply it to the same data.
ohe <- dummyVars("~.", data = ohe_input, fullRank = TRUE)
ohe_df <- data.table(predict(ohe, ohe_input))

# Rebuild combi as Item_Identifier followed by the encoded columns.
combi <- cbind(combi[, "Item_Identifier"], ohe_df)
# Skewed features ----
# Report the skewness of the two columns before transforming them.
skewness(combi$Item_Visibility)
skewness(combi$price_per_unit_wt)

# Replace both columns with log(x + 1) in one update.
combi[, `:=`(
  Item_Visibility = log(Item_Visibility + 1),
  price_per_unit_wt = log(price_per_unit_wt + 1)
)]
# Centre and scale numeric predictors ----
# Positions and names of all numeric columns in combi.
num_vars <- which(vapply(combi, is.numeric, logical(1)))
num_vars_names <- names(num_vars)

# Every numeric column except the target Item_Outlet_Sales is rescaled.
scaled_cols <- setdiff(num_vars_names, "Item_Outlet_Sales")
combi_numeric <- combi[, scaled_cols, with = FALSE]

# Fit centring and scaling on those columns, then apply it to them.
prep_num <- preProcess(combi_numeric, method = c("center", "scale"))
combi_numeric_norm <- predict(prep_num, combi_numeric)

# Swap the raw numeric columns for their rescaled versions; the rescaled
# columns are appended after the columns that remain in combi.
combi[, (scaled_cols) := NULL]
combi <- cbind(combi, combi_numeric_norm)
# Split back into train and test ----
# The first nrow(train) rows of combi are the training rows and the remainder
# are the test rows (combi was built by stacking train on top of test).
# The row count is captured before train is overwritten, and a logical mask is
# used instead of `1:n` / `(n + 1):m`, which count backwards when either part
# is empty.
n_train <- nrow(train)
in_train <- seq_len(nrow(combi)) <= n_train

train <- combi[in_train]
test <- combi[!in_train]

# The target column is dropped from the test set.
test[, Item_Outlet_Sales := NULL]
# XGBoost parameters ----
# Booster parameters shared by cross-validation and the final fit.
# "reg:squarederror" is the current name of the squared-error regression
# objective; the former alias "reg:linear" is deprecated in xgboost.
para_list <- list(
  objective = "reg:squarederror",
  eta = 0.01,
  gamma = 1,
  max_depth = 6,
  subsample = 0.8,
  colsample_bytree = 0.5
)
# XGBoost input matrices ----
# Training matrix: every column except the identifier and the target, with
# Item_Outlet_Sales supplied as the label.
train_features <- train[, !c("Item_Identifier", "Item_Outlet_Sales")]
d_train <- xgb.DMatrix(
  data = as.matrix(train_features),
  label = train[["Item_Outlet_Sales"]]
)

# Test matrix: every column except the identifier; no label.
test_features <- test[, !"Item_Identifier"]
d_test <- xgb.DMatrix(data = as.matrix(test_features))
# Cross-validation and final model ----
# 5-fold cross-validation for up to 1000 rounds; stops early once the
# evaluation metric has not improved (decreased) for 30 rounds.
set.seed(123)
xgb_cv <- xgb.cv(
  params = para_list,
  data = d_train,
  nrounds = 1000,
  nfold = 5,
  print_every_n = 10,
  early_stopping_rounds = 30,
  maximize = FALSE
)

# Use the round count chosen by early stopping rather than a fixed number, so
# the final fit stays in step with the cross-validation result.
# NOTE(review): where the best iteration is stored appears to differ between
# xgboost releases (top level vs. an `early_stop` element); both are checked.
# 428, the previously hard-coded value, is used if neither is present.
best_nrounds <- xgb_cv[["best_iteration"]]
if (is.null(best_nrounds)) {
  best_nrounds <- xgb_cv[["early_stop"]][["best_iteration"]]
}
if (is.null(best_nrounds)) {
  best_nrounds <- 428
}

# Fit the final model on the full training matrix and print it.
model_xgb <- xgb.train(
  data = d_train,
  params = para_list,
  nrounds = best_nrounds
)
model_xgb
# Variable importance ----
# Feature names are the training columns minus the identifier and the target,
# i.e. the same columns that were used to build the training matrix.
importance_features <- setdiff(
  names(train),
  c("Item_Identifier", "Item_Outlet_Sales")
)

# Compute the importance table for the fitted model and plot it.
variable_imp <- xgb.importance(
  feature_names = importance_features,
  model = model_xgb
)
xgb.plot.importance(variable_imp)