# Load the three Titanic competition CSVs in one pass: the training data, the
# test data and the gender_submission file.
train, test, gender = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
        "/kaggle/input/titanic/gender_submission.csv",
    )
)
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
import matplotlib.pyplot as plt

# `train` already comes from read_csv, so this only wraps it again; the name is
# kept in case other cells refer to `df_train`.
df_train = pd.DataFrame(train)

# Share of survivors within each sex. A bar per category is used because a line
# drawn through every row just zig-zags between the two x positions and shows
# nothing about the relationship.
survival_rate_by_sex = train.groupby("Sex")["Survived"].mean()
plt.bar(survival_rate_by_sex.index, survival_rate_by_sex.values)
plt.xlabel("Sex")
plt.ylabel("Survival rate")

# Split the passengers by sex, then overlay both age histograms on one figure
# (female in the default colour, male in orange).
female = train[train["Sex"] == "female"]
male = train[train["Sex"] == "male"]

plt.figure(figsize=(10, 6))
plt.hist(female["Age"], bins=10)
plt.hist(male["Age"], color="orange", bins=10)

# Row counts as a (Survived == 0, Survived == 1) pair.
tuple(len(train[train["Survived"] == outcome]) for outcome in (0, 1))
(549, 342)
0 549
1 342
Name: Survived, dtype: int64
# Show only the rows whose Pclass is 1.
train.loc[train["Pclass"] == 1]
3 491
1 216
2 184
Name: Pclass, dtype: int64
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
# Build the model inputs (X) and the target (y) from the training frame.
output_feature = ["Survived"]
# Columns that are not used as model inputs (this includes the target itself).
useless_features = ['PassengerId', 'Name', 'Survived', 'Ticket', 'Cabin']
# Every other column of `train`, in its original order.
input_features = [col for col in train.columns if col not in useless_features]

# Copy the column subset so the encodings below write to X itself instead of
# to a slice of `train`.
X, y = train[input_features].copy(), train[output_feature]

# Replace the string labels with integer category codes; a missing value is
# encoded as -1.
X["Sex"] = X["Sex"].astype("category").cat.codes
X["Embarked"] = X["Embarked"].astype("category").cat.codes
array([ 2, 0, 1, -1], dtype=int8)
Pclass 0
Sex 0
Age 177
SibSp 0
Parch 0
Fare 0
Embarked 0
dtype: int64
Pclass 0
Sex 0
Age 0
SibSp 0
Parch 0
Fare 0
Embarked 0
dtype: int64
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. The seed is fixed so the split, and
# every metric computed from it, is identical on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
((712, 7), (712, 1), (179, 7), (179, 1))
((891, 7), (891, 1))
4. Decision Tree Classifier
4.1 Decision Tree Classifier 사용하여 모델 트레이닝 시키기
from sklearn.tree import DecisionTreeClassifier
# Depth-limited decision tree with a fixed seed, fitted on the training split.
model = DecisionTreeClassifier(max_depth=3, random_state=0)
model.fit(X=X_train, y=y_train)
DecisionTreeClassifier(max_depth=3, random_state=0)
4.2 sklearn을 활용하여 accuracy 출력, confusion matrix 그리기, classification report 생성해보기, feature importance 뽑아보기
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
def summarize(model, X, y):
    """Print accuracy, a classification report and a confusion matrix.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature rows to predict on.
        y: True labels for the rows of ``X``.
    """
    # Score the data passed in as arguments, so the function reports on
    # whichever dataset the caller supplies rather than one fixed split.
    y_pred = model.predict(X)
    print(accuracy_score(y, y_pred))
    print(classification_report(y, y_pred))
    print(confusion_matrix(y, y_pred))


summarize(model, X_test, y_test)
precision recall f1-score support
0 0.80 0.92 0.85 105
1 0.86 0.66 0.75 74
accuracy 0.82 179
macro avg 0.83 0.79 0.80 179
weighted avg 0.82 0.82 0.81 179
[[97 8]
[25 49]]
# One bar per input column, sized by the fitted tree's feature importance.
plt.bar(X.columns, model.feature_importances_)
<BarContainer object of 7 artists>

4.3 sklearn.tree에 내장된 plot_tree를 이용하여, 생성한 Decision Tree 모델 plot하기
import matplotlib.pyplot as plt
from sklearn import tree
# Draw the fitted tree on a large canvas.
plt.figure(figsize=(20, 15))
tree.plot_tree(
    model,
    # Passed as a plain list: some scikit-learn releases reject a pandas Index
    # for this parameter.
    feature_names=list(X.columns),
    impurity=True,
    filled=True,
    rounded=True,
    # Far above the depth-3 limit the model was built with, so nothing is cut.
    max_depth=24,
)

# Report the held-out metrics for the fitted tree again.
summarize(model, X=X_test, y=y_test)
precision recall f1-score support
0 0.80 0.92 0.85 105
1 0.86 0.66 0.75 74
accuracy 0.82 179
macro avg 0.83 0.79 0.80 179
weighted avg 0.82 0.82 0.81 179
[[97 8]
[25 49]]
# Shapes of the label frames for the train and test splits.
tuple(labels.shape for labels in (y_train, y_test))
((712, 1), (179, 1))
# Shapes of the feature frames for the train and test splits.
tuple(features.shape for features in (X_train, X_test))
((712, 7), (179, 7))
5. Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV
# Search space: leaf-count caps 20..29 crossed with both split criteria.
max_leaf_nodes = range(20, 30)
criterion = ["gini", "entropy"]
params = {"max_leaf_nodes": max_leaf_nodes, "criterion": criterion}
# The estimator is seeded because tree fitting permutes features at each split;
# without a seed the selected parameters can differ from run to run.
tree_grid = GridSearchCV(
    DecisionTreeClassifier(random_state=0),
    params,
    cv=5,
    n_jobs=-1,
    verbose=1,
    scoring="accuracy",
)
tree_grid.fit(X_train, y_train)
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 56 tasks | elapsed: 2.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 3.0s finished
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,
param_grid={'criterion': ['gini', 'entropy'],
'max_leaf_nodes': range(20, 30)},
scoring='accuracy', verbose=1)
DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=28)
{'criterion': 'entropy', 'max_leaf_nodes': 28}
# Prepare the competition test file with the same columns and encodings as the
# training features, then predict.
df_test = pd.read_csv("/kaggle/input/titanic/test.csv")
# Copy the column subset so the assignments below write to an independent
# frame rather than to a slice of df_test.
submit_x = df_test[input_features].copy()
# Encode against the label set of the training data so each label gets the same
# integer code the model was fitted on, regardless of which labels this file
# happens to contain. Missing or unseen labels become -1.
for column in ("Sex", "Embarked"):
    known_labels = sorted(train[column].dropna().unique())
    submit_x[column] = pd.Categorical(
        submit_x[column], categories=known_labels
    ).codes
# Remaining gaps in the other columns are filled with -1.
submit_x = submit_x.fillna(-1)
# NOTE(review): this predicts with the depth-3 `model`, not the estimator found
# by `tree_grid` — confirm that is intended.
# The misspelled name `sumbit_y` is kept because the cell that writes the CSV
# reads it.
sumbit_y = model.predict(submit_x)
1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,
0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0])
# Build the submission from the frame the predictions were made on, so every
# PassengerId is paired with the prediction for that same row instead of
# relying on a separate template file having the same row order.
df_submit = pd.DataFrame(
    {"PassengerId": df_test["PassengerId"], "Survived": sumbit_y}
)
df_submit.to_csv("submit2.csv", index=False)