from
pyspark.ml.feature
import
StringIndexer
from
pyspark.ml
import
Pipeline
# Columns that are string-indexed below: each name listed here is fed to a
# StringIndexer and gains a companion "<name>_index" column.
# NOTE(review): LotFrontage, MasVnrArea and GarageYrBlt look numeric by name
# but are indexed like the other columns here -- confirm this is intended.
feat_list = [
    "MSZoning", "LotFrontage", "Street", "LotShape", "LandContour",
    "Utilities", "LotConfig", "LandSlope", "Neighborhood",
    "Condition1", "Condition2", "BldgType", "HouseStyle",
    "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd",
    "MasVnrType", "MasVnrArea", "ExterQual", "ExterCond", "Foundation",
    "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "Heating", "HeatingQC", "CentralAir", "Electrical",
    "KitchenQual", "Functional", "FireplaceQu",
    "GarageType", "GarageYrBlt", "GarageFinish", "GarageQual", "GarageCond",
    "PavedDrive", "SaleType", "SaleCondition",
]
print("indexed list created")
def _fit_indexer(name):
    """Fit a StringIndexer on df_new that encodes `name` into `name + "_index"`."""
    encoder = StringIndexer(inputCol=name, outputCol=name + "_index")
    return encoder.fit(df_new)


# One already-fitted indexer per entry of feat_list, in list order.
indexers = [_fit_indexer(name) for name in feat_list]
type(indexers)

# Chain the fitted indexers into a single pipeline and run it over df_new so
# that df_feat carries every "<name>_index" column.
# NOTE(review): the stages are fitted above, before they reach the Pipeline,
# so pipeline.fit() has no unfitted stage left to train -- confirm whether the
# eager per-column fit is needed elsewhere (e.g. to read the fitted labels).
pipeline = Pipeline(stages=indexers)
df_feat = pipeline.fit(df_new).transform(df_new)
df_feat.columns
from
pyspark.ml.linalg
import
Vectors
from
pyspark.ml.feature
import
VectorAssembler
# Columns handed to the assembler exactly as they appear in df_feat
# (presumably already numeric -- verify against the df_feat schema).
passthrough_cols = [
    'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond',
    'YearBuilt', 'YearRemodAdd',
    'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'GrLivArea',
    'BsmtFullBath', 'FullBath', 'HalfBath',
    'GarageArea', 'MoSold', 'YrSold',
]

# The indexed columns are derived from feat_list instead of being typed out a
# second time: the StringIndexer stage names its outputs `<column>_index`, so
# deriving them here keeps both places in sync when feat_list changes. The
# resulting names and their order match the previously hard-coded list.
indexed_cols = [name + '_index' for name in feat_list]

# Pack every input column into a single vector column named 'features'.
assembler = VectorAssembler(
    inputCols=passthrough_cols + indexed_cols,
    outputCol='features',
)
output = assembler.transform(df_feat)
# Keep only the assembled feature vector and the SalePrice column
# (presumably the regression target -- confirm against the model code).
final_data = output.select('features', 'SalePrice')

# Split with weights 0.7 / 0.3. A fixed seed is passed so the same rows land
# in the same partition on every run; without it each execution produced a
# different split and downstream results could not be reproduced.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)