# Notebook setup (run once, e.g. in a Colab cell): pip install pytorch-tabnet
import os

import numpy as np
import pandas as pd
import torch
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
# Load the training and test tables from the notebook's working directory.
data = pd.read_csv('/content/train.csv')
# Bare expressions below are notebook-style inspections; they have no effect
# when the file runs as a plain script.
data.head()
data.isna().sum()

test_data = pd.read_csv('/content/test.csv')
test_data.head()
test_data.isna().sum()

# Use the loan identifier as the row index so it is not fed to the model as a
# feature column.
data.set_index('Loan_ID', inplace=True)
test_data.set_index('Loan_ID', inplace=True)

# Impute missing values from the next valid row. `fillna(method="bfill")` is
# deprecated in recent pandas, so call `bfill` directly. Back-filling cannot
# fill gaps in the trailing rows (there is no later value to copy), so a
# forward-fill pass follows to cover any NaN left at the end of a column.
data.bfill(inplace=True)
data.ffill(inplace=True)
test_data.bfill(inplace=True)
test_data.ffill(inplace=True)
# Fit one LabelEncoder per categorical training column and overwrite the
# column with its integer codes. The fitted encoders are kept so the same
# mapping can be applied to the test table below.
fitted_encoders = {}
for column_name in (
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Credit_History',
    'Property_Area',
    'Loan_Status',
):
    encoder = LabelEncoder().fit(data[column_name])
    data[column_name] = encoder.transform(data[column_name])
    fitted_encoders[column_name] = encoder

# Expose each fitted encoder under its original module-level name.
gen = fitted_encoders['Gender']
s_type = fitted_encoders['Married']
n_dep = fitted_encoders['Dependents']
edu = fitted_encoders['Education']
s_emp = fitted_encoders['Self_Employed']
c_history = fitted_encoders['Credit_History']
p_area = fitted_encoders['Property_Area']
l_status = fitted_encoders['Loan_Status']

# Apply the training-fitted encoders to the test features. 'Loan_Status' is
# not transformed for the test table.
for column_name in (
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Credit_History',
    'Property_Area',
):
    test_data[column_name] = fitted_encoders[column_name].transform(
        test_data[column_name]
    )
# Separate the feature columns from the 'Loan_Status' target column using a
# single boolean column mask.
target_mask = data.columns == 'Loan_Status'
X = data.loc[:, ~target_mask]
y = data.loc[:, target_mask]
# Notebook-style shape inspection; no effect when run as a script.
X.shape, y.shape

# Convert both to NumPy arrays; the single-column target frame is flattened
# to one dimension.
X, y = X.to_numpy(), y.to_numpy().flatten()
# 5-fold cross-validation: train a fresh TabNet classifier on each fold and
# record its `best_cost` after fitting.
kf = KFold(n_splits=5, random_state=42, shuffle=True)
CV_score_array = []
for train_index, test_index in kf.split(X):
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]

    tb_cls = TabNetClassifier(
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=1e-3),
        scheduler_params={"step_size": 10, "gamma": 0.9},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='entmax',
    )

    # The second eval_set entry must be this fold's held-out split. The
    # original referenced undefined names (`X_val`, `y_val`), which raised
    # NameError on the first iteration.
    tb_cls.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_name=['train', 'valid'],
        eval_metric=['accuracy'],
        max_epochs=1000,
        patience=100,
        batch_size=28,
        drop_last=False,
    )

    # NOTE(review): `best_cost` is presumably the best 'valid' accuracy tracked
    # by early stopping — confirm against the pytorch-tabnet documentation.
    CV_score_array.append(tb_cls.best_cost)
# Predict loan status for the test table with `tb_cls` (the classifier fitted
# in the last cross-validation fold). `X_test` was never defined in the
# original script, so it is built here from the prepared `test_data`.
# NOTE(review): assumes test_data's columns are in the same order as the
# training features in X — confirm against the CSV headers.
X_test = test_data.to_numpy()

# Predicted labels below 0.5 become 'N' and the rest 'Y'.
# NOTE(review): this assumes the 'Loan_Status' encoder assigned 'N' -> 0 and
# 'Y' -> 1 — verify with `l_status.classes_`.
predictions = ['N' if i < 0.5 else 'Y' for i in tb_cls.predict(X_test)]