USING SKLEARN PIPELINES AND VOTING CLASSIFIER

apr. 04, 2020
·
samuel mignot

Data Extraction

In [23]:
import warnings
warnings.filterwarnings('ignore')
In [16]:
import numpy as np
import pandas as pd
from mlxtend.plotting import plot_decision_regions

import os
from pprint import pprint

dfs = []
for dirname, _, filenames in os.walk('.'):
    for filename in filenames:
        if(filename in {'train.csv', 'test.csv'}):
            dfs.append(pd.read_csv(os.path.join(dirname, filename)))

test, train = dfs

Data Investigation

In [7]:
train.isnull().sum()
Out[7]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
In [8]:
train
Out[8]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

Feature Selection

In [9]:
X = train.drop(columns=['Survived', 'Ticket', 'Name'])
y = train.loc[:, ['Survived']]

General imports

In [25]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.metrics import accuracy_score

import joblib

Preprocessing Pipeline

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

numeric_features = ['Age', 'Fare']

numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer()),
    ('scaler', StandardScaler()),
]) 

categorical_features = ['Embarked', 'Sex', 'Pclass', 'SibSp', 'Parch', 'Cabin']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

Models

Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

params = {'classifier__C': np.linspace(0,1,10)}

grid = GridSearchCV(clf, param_grid=params, cv=10)

grid.fit(X, y.values.reshape(-1,))

print(grid.best_score_)
print(grid.best_params_)
0.8193258426966292
{'classifier__C': 0.8888888888888888}

Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

random_grid_rf = {'classifier__n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
               'classifier__max_features': ['auto', 'sqrt'],
               'classifier__max_depth': [int(x) for x in np.linspace(10, 110, num = 11)],
               'classifier__min_samples_split': [2, 5, 10],
               'classifier__min_samples_leaf': [1, 2, 4],
               'classifier__bootstrap': [True, False]}


clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier())])


rf_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid_rf, n_iter = 200, cv = 10, verbose=2, random_state=42, n_jobs = -1)

rf_random.fit(X, y.values.reshape(-1,))

print(rf_random.best_score_)
print(rf_random.best_params_)
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed: 11.3min
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed: 19.0min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed: 22.0min
[Parallel(n_jobs=-1)]: Done 1961 tasks      | elapsed: 27.3min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed: 27.5min finished
0.827191011235955
{'classifier__n_estimators': 200, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 2, 'classifier__max_features': 'sqrt', 'classifier__max_depth': 80, 'classifier__bootstrap': False}
In [27]:
joblib.dump(rf_random.best_estimator_, 'best_rf.pickle')

solution_df = pd.DataFrame(np.column_stack((test.PassengerId.values, rf_random.best_estimator_.predict(test.drop(columns=['Ticket', 'Name'])))), columns=['PassengerId', 'Survived'])
solution_df.to_csv('solution_rf.csv', index=False)

SVC

In [28]:
from sklearn.svm import SVC

random_grid_svc = {'classifier__kernel': ['linear', 'rbf', 'poly'],
               'classifier__gamma': [0.1, 1, 10, 20],
               'classifier__C': [0.1, 1, 10, 100],
              }

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', SVC())])

svc_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid_svc, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

svc_random.fit(X, y.values.reshape(-1,))

print(svc_random.best_score_)
print(svc_random.best_params_)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.9s
0.8204264870931537
{'classifier__kernel': 'poly', 'classifier__gamma': 0.1, 'classifier__C': 1}
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed: 22.0min finished
In [29]:
joblib.dump(svc_random.best_estimator_, 'best_svc.pickle')

solution_df_svc = pd.DataFrame(np.column_stack((test.PassengerId.values, svc_random.best_estimator_.predict(test.drop(columns=['Ticket', 'Name'])))), columns=['PassengerId', 'Survived'])
solution_df_svc.to_csv('solution_svc.csv', index=False)

MLP

In [30]:
from sklearn.neural_network import MLPClassifier

random_grid_mlp = {
                   'classifier__hidden_layer_sizes': [32, 64, 128, 256, 512],
                   'classifier__activation': ['identity', 'logistic', 'tanh', 'relu'],
                   'classifier__solver': ['lbfgs', 'sgd', 'adam'],
                   'classifier__alpha': [.00001, .0001, .001, .01],
                   'classifier__learning_rate': ['constant', 'invscaling', 'adaptive'],
                }


clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', MLPClassifier(max_iter=500))])


mlp_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid_mlp, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

mlp_random.fit(X, y.values.reshape(-1,))

print(mlp_random.best_score_)
print(mlp_random.best_params_)
Fitting 3 folds for each of 100 candidates, totalling 300 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  7.4min finished
0.8114478114478114
{'classifier__solver': 'adam', 'classifier__learning_rate': 'adaptive', 'classifier__hidden_layer_sizes': 32, 'classifier__alpha': 0.01, 'classifier__activation': 'tanh'}
In [31]:
joblib.dump(mlp_random.best_estimator_, 'best_mlp.pickle')

solution_df_mlp = pd.DataFrame(np.column_stack((test.PassengerId.values, mlp_random.best_estimator_.predict(test.drop(columns=['Ticket', 'Name'])))), columns=['PassengerId', 'Survived'])
solution_df_mlp.to_csv('solution_mlp.csv', index=False)

Voting Classifier

In [32]:
v = VotingClassifier([('log', grid.best_estimator_), ('rf', rf_random.best_estimator_), ('svc', svc_random.best_estimator_), ('mlp', mlp_random.best_estimator_)])
In [34]:
v.fit(X, y.values.reshape(-1,))
print(accuracy_score(v.predict(X), y))
0.8552188552188552
In [36]:
joblib.dump(v, 'best_vc.pickle')

solution_df_v = pd.DataFrame(np.column_stack((test.PassengerId.values, v.predict(test.drop(columns=['Ticket', 'Name'])))), columns=['PassengerId', 'Survived'])
solution_df_v.to_csv('solution_voting_classifier.csv', index=False)

Submit to Kaggle

In [41]:
import os
os.system("kaggle competitions submit -c titanic -f submission.csv -m 'Voting Classifier with sklearn Pipeline preprocessing'")
Out[41]:
32512
In [ ]: