# Import the libraries

import pandas as pd
import numpy as np
from sklearn import tree
# Load the training data before inspecting it; nothing earlier in this script
# defines `train`.
# NOTE(review): the file name is an assumption (the usual Kaggle Titanic
# training file) -- confirm the actual path/URL before running.
train = pd.read_csv("train.csv")

# Print explicitly: a bare expression only displays output in an interactive
# session, while the rest of this script uses print().
print(train.describe())
print(train.shape)
(891,12)


How many people in your training set survived the Titanic disaster? To see this, you can use the value_counts() method in combination with standard bracket notation to select a single column of a DataFrame:

# Passengers that survived vs passengers that passed away

# Absolute counts of each value in the Survived column (normalize left off).
survived_num = train.loc[:, "Survived"].value_counts(normalize=False)
print(survived_num)

0    549
1    342
Name: Survived, dtype: int64


# As proportions

# Same tally as a fraction of all rows rather than raw counts.
percentage = train.loc[:, "Survived"].value_counts(normalize=True)
print(percentage)

0    0.616162
1    0.383838
Name: Survived, dtype: float64


# Males that survived vs males that passed away

# Select the Survived values of the rows whose Sex is 'male', then count them.
male_survived = train.loc[train["Sex"] == 'male', "Survived"].value_counts()
print(male_survived)

0    468
1    109
Name: Survived, dtype: int64


# Females that survived vs Females that passed away

# Select the Survived values of the rows whose Sex is 'female', then count them.
female_survived = train.loc[train["Sex"] == 'female', "Survived"].value_counts()
print(female_survived)

1    233
0     81
Name: Survived, dtype: int64


# Normalized male survival

# Survival outcome of the 'male' rows, expressed as proportions of that group.
nor_male = train.loc[train["Sex"] == 'male', "Survived"].value_counts(normalize=True)
print(nor_male)

0    0.811092
1    0.188908
Name: Survived, dtype: float64


# Normalized female survival

# Survival outcome of the 'female' rows, expressed as proportions of that group.
nor_female = train.loc[train["Sex"] == 'female', "Survived"].value_counts(normalize=True)
print(nor_female)

1    0.742038
0    0.257962
Name: Survived, dtype: float64


# Does Age play a role?

Another variable that could influence survival is age, since it is likely that children were saved first. You can test this by creating a new column with a categorical variable, Child. Child will take the value 1 where age is less than 18, and the value 0 where age is greater than or equal to 18.

To add this new variable you need to do two things (i) create a new column, and (ii) provide the values for each observation (i.e., row) based on the age of the passenger.

Adding a new column with Pandas in Python is easy and can be done via the following syntax:

train["Child"] = float("NaN")


# Assign 1 to passengers under 18, 0 to those 18 or older.
# Each assignment is a single .loc[row_mask, column] call instead of chained
# indexing (train["Child"][mask] = ...). The chained form writes into an
# intermediate object: pandas warns about it, and with Copy-on-Write enabled
# it leaves `train` unchanged. Rows where neither comparison is true (for
# example a missing Age) keep the NaN set above.

train.loc[train["Age"] < 18, "Child"] = 1
train.loc[train["Age"] >= 18, "Child"] = 0


# Print normalized Survival Rates for passengers under 18

# Proportion of each Survived value among the rows flagged Child == 1.
print(train.loc[train["Child"] == 1, "Survived"].value_counts(normalize=True))
1    0.539823
0    0.460177
Name: Survived, dtype: float64


# Print normalized Survival Rates for passengers 18 or older

# Proportion of each Survived value among the rows flagged Child == 0.
print(train.loc[train["Child"] == 0, "Survived"].value_counts(normalize=True))
0    0.618968
1    0.381032
Name: Survived, dtype: float64


# Convert the male and female groups to integer form

# Re-encode the whole column in one assignment instead of chained indexing
# (train["Sex"][mask] = ...), which pandas warns about and which does not
# modify `train` when Copy-on-Write is enabled.
# Any value other than "male"/"female" becomes NaN, so run this once on the
# raw string column.
train["Sex"] = train["Sex"].map({"male": 0, "female": 1})

# Impute the Embarked variable

train["Embarked"] = train["Embarked"].fillna("S")


# Convert the Embarked classes to integer form.
# A single column-level map replaces the chained assignments
# (train["Embarked"][mask] = ...), which pandas warns about and which do not
# modify `train` when Copy-on-Write is enabled. Missing values were filled
# with "S" above; any other unexpected value would become NaN.

train["Embarked"] = train["Embarked"].map({"S": 0, "C": 1, "Q": 2})


# Print the Sex and Embarked columns

print(train.loc[:, "Sex"])
print(train.loc[:, "Embarked"])


# Dump the whole frame to review which columns are available as features

print(train)


# Create the target and features numpy arrays: target, features_one

target = train["Survived"].values

# Fill missing Age values with the column median while building the feature
# matrix; `train` itself is left untouched. scikit-learn releases before 1.3
# reject NaN in DecisionTreeClassifier input.
# NOTE(review): assumes Age has missing values, as in the standard Kaggle
# Titanic training set -- fillna is a no-op if it does not.
features_one = (
    train[["Pclass", "Sex", "Age", "Fare"]]
    .fillna({"Age": train["Age"].median()})
    .values
)


# Fit your first decision tree: my_tree_one

# Fix the seed so repeated runs build the same tree: scikit-learn randomly
# permutes the candidate features at each split, so ties between equally good
# splits are otherwise broken differently from run to run.
my_tree_one = tree.DecisionTreeClassifier(random_state=1)
my_tree_one = my_tree_one.fit(features_one, target)


# Look at the importance and score of the included features

# Importances follow the column order of features_one. The score is computed
# on the same data the tree was fit on, so it is not a held-out estimate.
print(my_tree_one.feature_importances_)
print(my_tree_one.score(X=features_one, y=target))


