# Packages this script relies on. They are installed only when missing, so
# re-running the script does not reinstall everything (and does not need
# network access once the packages are present).
required_packages <- c(
  "data.table", "dplyr", "glmnet", "ggplot2",
  "caret", "xgboost", "e1071", "cowplot"
)
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}

# Attach in the original order so function masking between packages is
# unchanged.
library(data.table)
library(dplyr)
library(glmnet)
library(ggplot2)
library(caret)
library(xgboost)
library(e1071)
library(cowplot)
# Read the two input files (paths are relative to the working directory).
train <- fread("Train_UWu5bXk.csv")
test <- fread("Test_u94Q5KV.csv")

# Give `test` an all-NA Item_Outlet_Sales column (set by reference) so it can
# be stacked underneath `train`.
set(test, j = "Item_Outlet_Sales", value = NA)

# Stack train on top of test; training rows come first in `combi`.
combi <- rbind(train, test)
# Impute missing Item_Weight with the mean weight of the same Item_Identifier.
# `missing_index` (row numbers that were NA before imputation) is kept because
# it is a top-level variable that later code may read.
missing_index <- which(is.na(combi$Item_Weight))

# One grouped update by reference replaces the row-by-row loop, which rescanned
# the whole table for every missing row. If an item has no observed weight at
# all, mean(..., na.rm = TRUE) is NaN and the rows stay missing, as before.
combi[
  ,
  Item_Weight := replace(
    Item_Weight,
    is.na(Item_Weight),
    mean(Item_Weight, na.rm = TRUE)
  ),
  by = Item_Identifier
]
# Treat Item_Visibility == 0 as a placeholder and replace it with the mean
# visibility of the same Item_Identifier.
# `zero_index` (row numbers that were zero before imputation) is kept because
# it is a top-level variable that later code may read.
zero_index <- which(combi$Item_Visibility == 0)

# The mean is taken over the item's non-zero, non-missing values only. The
# previous loop averaged over all rows of the item, zeros included, and
# overwrote rows one at a time, so the imputed value was pulled toward zero
# and differed between rows of the same item depending on row order.
# Items with no non-zero visibility are left unchanged (they stay 0).
combi[
  ,
  Item_Visibility := {
    observed <- Item_Visibility[
      !is.na(Item_Visibility) & Item_Visibility != 0
    ]
    if (length(observed) > 0) {
      replace(Item_Visibility, which(Item_Visibility == 0), mean(observed))
    } else {
      Item_Visibility
    }
  },
  by = Item_Identifier
]
# Encode the two outlet descriptors as numeric codes.
#   Outlet_Size:          "Small"  -> 0, "Medium" -> 1, any other value -> 2
#   Outlet_Location_Type: "Tier 3" -> 0, "Tier 2" -> 1, any other value -> 2
# A missing (NA) input yields NA: the last fcase() condition is FALSE for NA,
# so the result falls through to fcase()'s NA default.
combi[, `:=`(
  Outlet_Size_num = fcase(
    Outlet_Size == "Small", 0,
    Outlet_Size == "Medium", 1,
    !is.na(Outlet_Size), 2
  ),
  Outlet_Location_Type_num = fcase(
    Outlet_Location_Type == "Tier 3", 0,
    Outlet_Location_Type == "Tier 2", 1,
    !is.na(Outlet_Location_Type), 2
  )
)]

# The original character columns are no longer needed; drop them by reference.
set(combi, j = c("Outlet_Size", "Outlet_Location_Type"), value = NULL)
# One-hot encode every column except the three listed below. fullRank = TRUE
# is passed to dummyVars() as in the original call.
ohe_excluded_cols <- c(
  "Item_Identifier",
  "Outlet_Establishment_Year",
  "Item_Type"
)
ohe_input <- combi[, !ohe_excluded_cols, with = FALSE]

ohe_1 <- dummyVars("~.", data = ohe_input, fullRank = TRUE)
ohe_df <- data.table(predict(ohe_1, ohe_input))

# Rebuild `combi` as the identifier column plus the encoded columns; the other
# two excluded columns are not carried forward.
combi <- cbind(combi[, "Item_Identifier"], ohe_df)
# Report skewness for the candidate columns that actually exist in `combi`.
# `price_per_unit_wt` is not created anywhere in the visible part of this
# script, so it is checked only if some other step has added it; calling
# skewness() on a non-existent column would pass NULL.
skew_check_cols <- intersect(
  c("Item_Visibility", "price_per_unit_wt"),
  names(combi)
)
print(vapply(
  skew_check_cols,
  function(col_name) skewness(combi[[col_name]]),
  numeric(1)
))

# log(1 + x) transform of Item_Visibility; log1p() computes the same quantity
# and is more accurate for values close to zero.
combi[, Item_Visibility := log1p(Item_Visibility)]
# Locate the numeric columns of `combi` (named integer positions).
num_vars <- which(vapply(combi, is.numeric, logical(1)))
num_vars_names <- names(num_vars)

# Every numeric column except the target gets centered and scaled.
scaled_cols <- setdiff(num_vars_names, "Item_Outlet_Sales")
combi_numeric <- combi[, scaled_cols, with = FALSE]

prep_num <- preProcess(combi_numeric, method = c("center", "scale"))
combi_numeric_norm <- predict(prep_num, combi_numeric)

# Swap the raw numeric columns for their normalized versions. Dropping first
# and then binding places the normalized columns after the remaining ones.
combi[, (scaled_cols) := NULL]
combi <- cbind(combi, combi_numeric_norm)
# Split `combi` back into its two parts: the first nrow(train) rows are the
# training rows, everything after them is the test set.
# A logical mask is used instead of 1:n / (n + 1):m ranges, which count
# backwards (and select wrong rows) when either part is empty.
n_train_rows <- nrow(train)
is_train_row <- seq_len(nrow(combi)) <= n_train_rows
is_test_row <- !is_train_row

train <- combi[is_train_row]
test <- combi[is_test_row]

# Remove the Item_Outlet_Sales column from the test part.
test[, Item_Outlet_Sales := NULL]
# 5-fold cross-validation, shared by the penalized regression fits.
control <- trainControl(method = "cv", number = 5)

# Both tuning grids use the same lambda values and differ only in alpha.
lambda_values <- seq(0.001, 0.1, by = 0.0002)
Grid_la_reg <- expand.grid(alpha = 1, lambda = lambda_values)
Grid_ri_reg <- expand.grid(alpha = 0, lambda = lambda_values)

# Fix the RNG state immediately before model fitting. Nothing above draws
# random numbers, so one call leaves the generator in the same state as the
# two calls it replaces.
set.seed(123)
# Fit a glmnet model over the ridge grid (alpha = 0) using the resampling
# scheme in `control`. Predictors are all columns of `train` except the
# identifier and the target.
ridge_predictors <- train[
  ,
  !c("Item_Identifier", "Item_Outlet_Sales"),
  with = FALSE
]

# `caret::train` is spelled out because `train` is also the name of the
# training data table in this script.
# tuneGrid uses Grid_ri_reg: the previously referenced `Grid_reg` is not
# defined anywhere in the visible script.
Ridge_model <- caret::train(
  x = ridge_predictors,
  y = train$Item_Outlet_Sales,
  method = "glmnet",
  trControl = control,
  tuneGrid = Grid_ri_reg
)

# Model summary, mean RMSE across the resamples, and the tuning profile plot.
Ridge_model
mean(Ridge_model$resample$RMSE)
plot(Ridge_model, main = "Ridge Regression")