-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
33 lines (28 loc) · 1.93 KB
/
preprocessing.py
File metadata and controls
33 lines (28 loc) · 1.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
"""
This function is to cover all the preprocessing steps on the churn dataframe.
It involves selecting important features, encoding categorical data, handling missing values,feature scaling and splitting the data
"""
def preprocess(df,option):
def binary_map(feature):
return feature.map({'Yes':1, 'No':0})
# Encode binary categorical features
binary_list = ['SeniorCitizen','Dependents', 'PhoneService', 'PaperlessBilling']
df[binary_list] = df[binary_list].apply(binary_map)
columns = ['SeniorCitizen', 'Dependents', 'tenure', 'PhoneService', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges', 'MultipleLines_No_phone_service', 'MultipleLines_Yes', 'InternetService_Fiber_optic', 'InternetService_No', 'OnlineSecurity_No_internet_service', 'OnlineSecurity_Yes', 'OnlineBackup_No_internet_service', 'TechSupport_No_internet_service', 'TechSupport_Yes', 'StreamingTV_No_internet_service', 'StreamingTV_Yes', 'StreamingMovies_No_internet_service', 'StreamingMovies_Yes', 'Contract_One_year', 'Contract_Two_year', 'PaymentMethod_Electronic_check']
#Drop values based on operational options
if (option == "Online"):
df = pd.get_dummies(df).reindex(columns=columns, fill_value=0)
elif (option == "Batch"):
df = df[['SeniorCitizen','Dependents','tenure','PhoneService','MultipleLines','InternetService','OnlineSecurity',
'OnlineBackup','TechSupport','StreamingTV','StreamingMovies','Contract','PaperlessBilling','PaymentMethod',
'MonthlyCharges','TotalCharges']]
df = pd.get_dummies(df).reindex(columns=columns, fill_value=0)
else:
print("Incorrect operational options")
sc = MinMaxScaler()
df['tenure'] = sc.fit_transform(df[['tenure']])
df['MonthlyCharges'] = sc.fit_transform(df[['MonthlyCharges']])
df['TotalCharges'] = sc.fit_transform(df[['TotalCharges']])
return df