[PyTorch] Kaggle Digit Recognizer: Decision Tree Classifier
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/digit-recognizer/sample_submission.csv
/kaggle/input/digit-recognizer/train.csv
/kaggle/input/digit-recognizer/test.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))
digit-recognizer
df = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')
train_images = df.drop('label',axis=1)
labels = df['label']
train_images.head()
| pixel0 | pixel1 | pixel2 | pixel3 | pixel4 | pixel5 | pixel6 | pixel7 | pixel8 | pixel9 | ... | pixel774 | pixel775 | pixel776 | pixel777 | pixel778 | pixel779 | pixel780 | pixel781 | pixel782 | pixel783 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 784 columns
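Before modeling, a couple of quick sanity checks help confirm what was loaded; a minimal sketch (assuming the DataFrames above are still in memory) that prints the shape, the pixel value range (MNIST-style images are typically 0-255 grayscale), and whether anything is missing:
# Quick sanity checks on the raw data (sketch)
print(train_images.shape)                                     # (rows, 784 pixel columns)
print(train_images.values.min(), train_images.values.max())  # grayscale range, typically 0..255
print(df.isnull().any().any())                                # True if any cell is missing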
labels.head()
0    1
1    0
2    1
3    4
4    0
Name: label, dtype: int64
# Here is a quick (if rough) way to view the first few train_images with their respective labels as titles
fig,axes = plt.subplots(nrows=3,ncols=3)
image_index = 0
for x in range(0,3):
    for y in range(0,3):
        axes[x,y].set_title(labels[image_index])
        axes[x,y].imshow(train_images.iloc[image_index].values.reshape(28,28))
        axes[x,y].axis('off')
        image_index += 1
plt.tight_layout()
plt.figure(figsize=(10,6))
sns.countplot(labels)
plt.grid()
/opt/conda/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning
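The FutureWarning above comes from passing the Series positionally; newer seaborn versions want it as a keyword argument, exactly as the message suggests. A minimal sketch of the warning-free form:
# Same class-distribution plot, passing the data as a keyword argument (sketch)
plt.figure(figsize=(10,6))
sns.countplot(x=labels)
plt.grid()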
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_images, labels, test_size=0.33, random_state=42)
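Since the countplot shows the digit counts are not perfectly equal, a stratified split keeps the label proportions the same in both folds. This is only an alternative sketch (the results below come from the plain split above), using hypothetical *_s names so the real split is not overwritten:
# Stratified variant of the split (sketch, not used for the results below)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    train_images, labels, test_size=0.33, random_state=42, stratify=labels)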
from sklearn.tree import DecisionTreeClassifier
'''
Since we don't have an equal distribution of samples
per class, we balance the class weights with class_weight='balanced'.
'''
dtree = DecisionTreeClassifier(class_weight='balanced')
dtree.fit(X_train,y_train)
DecisionTreeClassifier(class_weight='balanced')
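class_weight='balanced' weights each class by n_samples / (n_classes * class_count), so under-represented digits count a bit more when the tree evaluates its splits. A short sketch of the same weights computed by hand:
# Reproduce the 'balanced' class weights manually (sketch)
counts = np.bincount(y_train)                    # samples per digit in the training fold
weights = len(y_train) / (len(counts) * counts)  # n_samples / (n_classes * count_c)
print(dict(zip(range(len(counts)), np.round(weights, 3))))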
predictions = dtree.predict(X_test)
print(classification_report(y_test,predictions))
              precision    recall  f1-score   support

           0       0.93      0.92      0.93      1333
           1       0.93      0.95      0.94      1520
           2       0.84      0.84      0.84      1414
           3       0.83      0.81      0.82      1471
           4       0.84      0.85      0.84      1358
           5       0.78      0.79      0.78      1205
           6       0.88      0.86      0.87      1397
           7       0.88      0.89      0.88      1480
           8       0.79      0.79      0.79      1334
           9       0.80      0.80      0.80      1348

    accuracy                           0.85     13860
   macro avg       0.85      0.85      0.85     13860
weighted avg       0.85      0.85      0.85     13860
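confusion_matrix was imported earlier but never used; plotting it makes the specific digit confusions easier to spot than the report alone. A minimal sketch for the decision tree predictions:
# Confusion matrix heatmap for the decision tree (sketch)
cm = confusion_matrix(y_test, predictions)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()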
from sklearn.ensemble import RandomForestClassifier
rdc = RandomForestClassifier(n_estimators=100)
rdc.fit(X_train,y_train)
RandomForestClassifier()
predictions = rdc.predict(X_test)
print(classification_report(y_test,predictions))
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1333
           1       0.98      0.99      0.98      1520
           2       0.96      0.96      0.96      1414
           3       0.95      0.93      0.94      1471
           4       0.96      0.97      0.96      1358
           5       0.97      0.95      0.96      1205
           6       0.97      0.98      0.97      1397
           7       0.97      0.95      0.96      1480
           8       0.95      0.95      0.95      1334
           9       0.93      0.94      0.94      1348

    accuracy                           0.96     13860
   macro avg       0.96      0.96      0.96     13860
weighted avg       0.96      0.96      0.96     13860
X_train.shape, X_test.shape
((28140, 784), (13860, 784))
predictions.shape
(13860,)
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
accuracy_score(y_test, predictions)
0.9608225108225108
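Tree ensembles also expose feature_importances_, which can be reshaped back onto the 28x28 grid to see which pixels the forest actually relies on; a sketch assuming rdc is the fitted RandomForestClassifier from above:
# Per-pixel importance map from the fitted random forest (sketch)
importances = rdc.feature_importances_.reshape(28, 28)
plt.figure(figsize=(6,6))
plt.imshow(importances, cmap='hot')
plt.title('Random forest pixel importances')
plt.colorbar()
plt.show()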
df_test = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')
df_submission = pd.read_csv('/kaggle/input/digit-recognizer/sample_submission.csv')
input_features = [col for col in df]
print(len(input_features))
input_features.remove("label")
len(input_features)
785
784
submission_X = df_test[input_features]
submission_y = rdc.predict(submission_X)
df_submission['Label'] = submission_y
df_submission.to_csv("submission2.csv", index = False)
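Before uploading, it is worth re-reading the file and checking it against the sample submission format (an ImageId column plus the Label column filled above); a minimal sketch:
# Sanity-check the written submission against the sample (sketch)
check = pd.read_csv("submission2.csv")
print(check.shape == df_submission.shape)   # same number of rows and columns as the sample
print(check.columns.tolist())               # expected something like ['ImageId', 'Label']
print(check['Label'].between(0, 9).all())   # every prediction is a digit 0-9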