# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv
train = pd.read_csv("/kaggle/input/titanic/train.csv")
train
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

test = pd.read_csv("/kaggle/input/titanic/test.csv")
test
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
... ... ... ... ... ... ... ... ... ... ... ...
413 1305 3 Spector, Mr. Woolf male NaN 0 0 A.5. 3236 8.0500 NaN S
414 1306 1 Oliva y Ocana, Dona. Fermina female 39.0 0 0 PC 17758 108.9000 C105 C
415 1307 3 Saether, Mr. Simon Sivertsen male 38.5 0 0 SOTON/O.Q. 3101262 7.2500 NaN S
416 1308 3 Ware, Mr. Frederick male NaN 0 0 359309 8.0500 NaN S
417 1309 3 Peter, Master. Michael J male NaN 1 1 2668 22.3583 NaN C

418 rows × 11 columns

gender = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")
gender
PassengerId Survived
0 892 0
1 893 1
2 894 0
3 895 0
4 896 1
... ... ...
413 1305 0
414 1306 1
415 1307 0
416 1308 0
417 1309 0

418 rows × 2 columns

train.columns
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
df_train = pd.DataFrame(train)
df_train
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

import matplotlib.pyplot as plt
# plt.figure(12.5)
plt.plot(train['Sex'], train['Survived'])
plt.xlabel('sex')
plt.ylabel('survived')
plt.show()

female = train.loc[train.Sex =='female']
male = train.loc[train.Sex =='male']
male
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
5 6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q
6 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S
7 8 0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 349909 21.0750 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
883 884 0 2 Banfield, Mr. Frederick James male 28.0 0 0 C.A./SOTON 34068 10.5000 NaN S
884 885 0 3 Sutehall, Mr. Henry Jr male 25.0 0 0 SOTON/OQ 392076 7.0500 NaN S
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

577 rows × 12 columns

plt.figure(figsize=(10, 6))
plt.subplot(121)
plt.title("female")
plt.hist(female.Age, bins=10)

plt.subplot(122)
plt.title("male")
plt.hist(male.Age, bins=10, color='orange')
plt.show()

len(train[train["Survived"] == 0]), len(train[train["Survived"] == 1])
(549, 342)
train["Survived"].value_counts()
0    549
1    342
Name: Survived, dtype: int64
train[train["Pclass"] == 1]
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
6 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S
11 12 1 1 Bonnell, Miss. Elizabeth female 58.0 0 0 113783 26.5500 C103 S
23 24 1 1 Sloper, Mr. William Thompson male 28.0 0 0 113788 35.5000 A6 S
... ... ... ... ... ... ... ... ... ... ... ... ...
871 872 1 1 Beckwith, Mrs. Richard Leonard (Sallie Monypeny) female 47.0 1 1 11751 52.5542 D35 S
872 873 0 1 Carlsson, Mr. Frans Olof male 33.0 0 0 695 5.0000 B51 B53 B55 S
879 880 1 1 Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) female 56.0 0 1 11767 83.1583 C50 C
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C

216 rows × 12 columns

train["Pclass"].value_counts()
3    491
1    216
2    184
Name: Pclass, dtype: int64
train.isnull().sum()
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
for col in train:
    print(col)
PassengerId
Survived
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked
input_features = [col for col in train]
input_features
['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']
output_feature = ["Survived"]
useless_features = ['PassengerId', 'Name', 'Survived', 'Ticket', 'Cabin']

for f in useless_features:
    input_features.remove(f)
X, y = train[input_features], train[output_feature]
X
Pclass Sex Age SibSp Parch Fare Embarked
0 3 male 22.0 1 0 7.2500 S
1 1 female 38.0 1 0 71.2833 C
2 3 female 26.0 0 0 7.9250 S
3 1 female 35.0 1 0 53.1000 S
4 3 male 35.0 0 0 8.0500 S
... ... ... ... ... ... ... ...
886 2 male 27.0 0 0 13.0000 S
887 1 female 19.0 0 0 30.0000 S
888 3 female NaN 1 2 23.4500 S
889 1 male 26.0 0 0 30.0000 C
890 3 male 32.0 0 0 7.7500 Q

891 rows × 7 columns

# X['Sex'] = X['Sex'].astype("category").cat.codes

X["Sex"] = X["Sex"].astype("category").cat.codes
X["Embarked"] = X["Embarked"].astype("category").cat.codes
X
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
Pclass Sex Age SibSp Parch Fare Embarked
0 3 1 22.0 1 0 7.2500 2
1 1 0 38.0 1 0 71.2833 0
2 3 0 26.0 0 0 7.9250 2
3 1 0 35.0 1 0 53.1000 2
4 3 1 35.0 0 0 8.0500 2
... ... ... ... ... ... ... ...
886 2 1 27.0 0 0 13.0000 2
887 1 0 19.0 0 0 30.0000 2
888 3 0 NaN 1 2 23.4500 2
889 1 1 26.0 0 0 30.0000 0
890 3 1 32.0 0 0 7.7500 1

891 rows × 7 columns

X['Embarked'].unique()
array([ 2,  0,  1, -1], dtype=int8)
X[X['Embarked'] == -1]
Pclass Sex Age SibSp Parch Fare Embarked
61 1 0 38.0 0 0 80.0 -1
829 1 0 62.0 0 0 80.0 -1
X.isnull().sum()
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      0
dtype: int64
X = X.fillna(-1)
X.isnull().sum()
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
((712, 7), (712, 1), (179, 7), (179, 1))
X.shape, y.shape
((891, 7), (891, 1))

4. Decision Tree Classifier

4.1 Decision Tree Classifier 사용하여 모델 트레이닝 시키기

from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(random_state=0, max_depth=3)
model.fit(X_train, y_train)
DecisionTreeClassifier(max_depth=3, random_state=0)

4.2 sklearn을 활용하여 accuracy출력, confusion matrix 그리기, classification report 생성해보기, feature importance 뽑아보기

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
def summarize(model, X, y):
    y_pred = model.predict(X_test)
    print(accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

summarize(model,X_test, y_test)
0.8156424581005587
              precision    recall  f1-score   support

           0       0.80      0.92      0.85       105
           1       0.86      0.66      0.75        74

    accuracy                           0.82       179
   macro avg       0.83      0.79      0.80       179
weighted avg       0.82      0.82      0.81       179

[[97  8]
 [25 49]]
plt.bar(x = X.columns, height = model.feature_importances_)
<BarContainer object of 7 artists>

4.3 sklearn.tree에 내장된 plot_tree를 이용하여, 생성한 Decision Tree 모델 plot하기

import matplotlib.pyplot as plt
from sklearn import tree



plt.figure( figsize=(20,15) )
tree.plot_tree(model, 
               feature_names = X.columns,
               impurity=True, filled=True,
               rounded=True, max_depth=24)
plt.show()

summarize(model,X_test, y_test)
0.8156424581005587
              precision    recall  f1-score   support

           0       0.80      0.92      0.85       105
           1       0.86      0.66      0.75        74

    accuracy                           0.82       179
   macro avg       0.83      0.79      0.80       179
weighted avg       0.82      0.82      0.81       179

[[97  8]
 [25 49]]
y_train.shape, y_test.shape
((712, 1), (179, 1))
X_train.shape, X_test.shape
((712, 7), (179, 7))

5. Hyperparameter Tuning

from sklearn.model_selection import GridSearchCV
max_leaf_nodes = range(20, 30)
criterion = ["gini", "entropy"]

params = {"max_leaf_nodes": max_leaf_nodes, "criterion": criterion}

tree_grid = GridSearchCV(DecisionTreeClassifier(), params, cv=5, n_jobs=-1, verbose=1, scoring="accuracy")
tree_grid.fit(X_train, y_train)
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.0s finished
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_leaf_nodes': range(20, 30)},
             scoring='accuracy', verbose=1)
tree_grid.best_estimator_
DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=28)
tree_grid.best_params_
{'criterion': 'entropy', 'max_leaf_nodes': 28}
df_test = pd.read_csv("/kaggle/input/titanic/test.csv")

submit_x = df_test[input_features]
submit_x["Sex"] = submit_x["Sex"].astype("category").cat.codes
submit_x["Embarked"] = submit_x["Embarked"].astype("category").cat.codes
submit_x = submit_x.fillna(-1)
sumbit_y = model.predict(submit_x)
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
sumbit_y
array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0])
submit_x
Pclass Sex Age SibSp Parch Fare Embarked
0 3 1 34.5 0 0 7.8292 1
1 3 0 47.0 1 0 7.0000 2
2 2 1 62.0 0 0 9.6875 1
3 3 1 27.0 0 0 8.6625 2
4 3 0 22.0 1 1 12.2875 2
... ... ... ... ... ... ... ...
413 3 1 -1.0 0 0 8.0500 2
414 1 0 39.0 0 0 108.9000 0
415 3 1 38.5 0 0 7.2500 2
416 3 1 -1.0 0 0 8.0500 2
417 3 1 -1.0 1 1 22.3583 0

418 rows × 7 columns

df_submit = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")
df_submit["Survived"] = sumbit_y
df_submit
PassengerId Survived
0 892 0
1 893 1
2 894 0
3 895 0
4 896 1
... ... ...
413 1305 0
414 1306 1
415 1307 0
416 1308 0
417 1309 0

418 rows × 2 columns

df_submit.to_csv("submit2.csv", index=False)

댓글남기기