# Predicting who will survive on the Titanic 

This notebook is based on a Kaggle competition where the goal is to predict survival on the Titanic, based on real data. Kaggle hosts machine learning competitions where anyone can download a dataset, train a model, and test the predictions on the website. The author of the best model wins a prize. We will use a decision tree classifier to predict survival.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydot
from IPython.display import Image
from sklearn import metrics
from sklearn import model_selection
from sklearn import tree

# sklearn.externals.six was removed in scikit-learn 0.23; fall back to the
# standard-library StringIO when the bundled copy is unavailable.
try:
    from sklearn.externals.six import StringIO
except ImportError:
    from io import StringIO

%matplotlib inline

# Read the passenger records from the CSV file and preview the first five rows.
titanic = pd.read_csv(filepath_or_buffer='data/titanic.csv')
titanic.head(n=5)

In [None]:
# Drop the free-text Name and Ticket columns by label rather than by position.
# The previous positional, in-place drop (columns 3 and 8) removed a different
# pair of columns each time this cell was re-run; with errors='ignore' a re-run
# is a no-op instead.
# NOTE(review): assumes the CSV labels these columns 'Name' and 'Ticket'
# (the standard Kaggle Titanic header) -- confirm against data/titanic.csv.
titanic = titanic.drop(['Name', 'Ticket'], axis=1, errors='ignore')
# For simplicity, keep only the rows that have no missing values.
titanic = titanic.dropna()
titanic.head()

In [None]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each variable, which then acts as the reference category.
cabinDummies = pd.get_dummies(titanic.Cabin, prefix='Cabin', drop_first=True)
embarkedDummies = pd.get_dummies(titanic.Embarked, prefix='Embarked', drop_first=True)
sexDummies = pd.get_dummies(titanic.Sex, drop_first=True)
# Append the indicator columns, then remove the raw categorical columns they
# replace. The columns are dropped by label (not by position) so the result
# does not depend on the exact column layout produced by earlier cells.
titanicDF = pd.concat([titanic, cabinDummies, embarkedDummies, sexDummies], axis=1)
titanicDF = titanicDF.drop(['Sex', 'Cabin', 'Embarked'], axis=1)
titanicDF.columns

In [None]:
# Split into 60% train / 40% test; random_state fixes the shuffle so the split
# is the same on every run.
y = titanicDF.Survived.values
# Remove the label by name so it cannot leak into the feature matrix if the
# column order ever differs from what a positional index would expect.
# NOTE(review): every other column of titanicDF is kept as a feature, including
# any row-identifier column (e.g. a passenger id) -- consider dropping it.
X = titanicDF.drop('Survived', axis=1)
trainX, testX, trainY, testY = model_selection.train_test_split(X, y, test_size=0.4, random_state=5)

In [None]:
# Fit a decision tree with default hyperparameters on the training split.
# scikit-learn permutes the candidate features at each split, so without a
# seed the fitted tree can differ between runs; random_state makes the fit
# reproducible (same seed as the train/test split).
clf = tree.DecisionTreeClassifier(random_state=5)
clf.fit(trainX, trainY)

In [None]:
# Render the fitted tree as a PNG image via Graphviz.
# out_file=None makes export_graphviz return the DOT source as a string, so no
# intermediate StringIO buffer is required. feature_names is passed as a plain
# list because some scikit-learn releases reject a pandas Index here.
dot_source = tree.export_graphviz(clf, out_file=None,
                                  feature_names=list(trainX.columns),
                                  class_names=["dead", "alive"],
                                  filled=True, rounded=True, special_characters=True)
# Newer pydot releases return a list of graphs from graph_from_dot_data, while
# older ones return a single graph object; accept both.
graphs = pydot.graph_from_dot_data(dot_source)
graph = graphs[0] if isinstance(graphs, (list, tuple)) else graphs
Image(graph.create_png())

In [None]:
# Score the tree on the data it was trained on and on the held-out test data;
# a large gap between the two reports indicates overfitting.
yTrainHat = clf.predict(trainX)
yTestHat = clf.predict(testX)
# The print statement is a SyntaxError on Python 3. Calling print() with a
# single pre-joined string produces identical output on Python 2 and 3.
print("Train " + metrics.classification_report(trainY, yTrainHat))
print("Test " + metrics.classification_report(testY, yTestHat))