[PyTorch] Kaggle Digit Recognizer: Decision Tree Classifier
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/digit-recognizer/sample_submission.csv
/kaggle/input/digit-recognizer/train.csv
/kaggle/input/digit-recognizer/test.csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))
digit-recognizer
df = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')
train_images = df.drop('label',axis=1)
labels = df['label']
train_images.head()
| pixel0 | pixel1 | pixel2 | pixel3 | pixel4 | pixel5 | pixel6 | pixel7 | pixel8 | pixel9 | ... | pixel774 | pixel775 | pixel776 | pixel777 | pixel778 | pixel779 | pixel780 | pixel781 | pixel782 | pixel783 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 784 columns
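Before modeling, a couple of quick sanity checks help confirm what was loaded; a minimal sketch (assuming the DataFrames above are still in memory) that prints the shape, the pixel value range (MNIST-style images are typically 0-255 grayscale), and whether anything is missing:
# Quick sanity checks on the raw data (sketch)
print(train_images.shape)                                     # (rows, 784 pixel columns)
print(train_images.values.min(), train_images.values.max())  # grayscale range, typically 0..255
print(df.isnull().any().any())                                # True if any cell is missing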
labels.head()
0    1
1    0
2    1
3    4
4    0
Name: label, dtype: int64
# Here is a quick (if rough) way to view the first few train_images with their respective labels as titles
fig,axes = plt.subplots(nrows=3,ncols=3)
image_index = 0
for x in range(0,3):
    for y in range(0,3):
        axes[x,y].set_title(labels[image_index])
        axes[x,y].imshow(train_images.iloc[image_index].values.reshape(28,28))
        axes[x,y].axis('off')
        image_index += 1
plt.tight_layout()
plt.figure(figsize=(10,6))
sns.countplot(labels)
plt.grid()
/opt/conda/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. FutureWarning
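The FutureWarning above comes from passing the Series positionally; newer seaborn versions want it as a keyword argument, exactly as the message suggests. A minimal sketch of the warning-free form:
# Same class-distribution plot, passing the data as a keyword argument (sketch)
plt.figure(figsize=(10,6))
sns.countplot(x=labels)
plt.grid()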
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_images, labels, test_size=0.33, random_state=42)
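Since the countplot shows the digit counts are not perfectly equal, a stratified split keeps the label proportions the same in both folds. This is only an alternative sketch (the results below come from the plain split above), using hypothetical *_s names so the real split is not overwritten:
# Stratified variant of the split (sketch, not used for the results below)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    train_images, labels, test_size=0.33, random_state=42, stratify=labels)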
from sklearn.tree import DecisionTreeClassifier
'''
Since we don't have an equal distribution of samples
per class, we balance the class weights with class_weight='balanced'.
'''
dtree = DecisionTreeClassifier(class_weight='balanced')
dtree.fit(X_train,y_train)
DecisionTreeClassifier(class_weight='balanced')
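class_weight='balanced' weights each class by n_samples / (n_classes * class_count), so under-represented digits count a bit more when the tree evaluates its splits. A short sketch of the same weights computed by hand:
# Reproduce the 'balanced' class weights manually (sketch)
counts = np.bincount(y_train)                    # samples per digit in the training fold
weights = len(y_train) / (len(counts) * counts)  # n_samples / (n_classes * count_c)
print(dict(zip(range(len(counts)), np.round(weights, 3))))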
predictions = dtree.predict(X_test)
print(classification_report(y_test,predictions))
              precision    recall  f1-score   support

           0       0.93      0.92      0.93      1333
           1       0.93      0.95      0.94      1520
           2       0.84      0.84      0.84      1414
           3       0.83      0.81      0.82      1471
           4       0.84      0.85      0.84      1358
           5       0.78      0.79      0.78      1205
           6       0.88      0.86      0.87      1397
           7       0.88      0.89      0.88      1480
           8       0.79      0.79      0.79      1334
           9       0.80      0.80      0.80      1348

    accuracy                           0.85     13860
   macro avg       0.85      0.85      0.85     13860
weighted avg       0.85      0.85      0.85     13860
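confusion_matrix was imported earlier but never used; plotting it makes the specific digit confusions easier to spot than the report alone. A minimal sketch for the decision tree predictions:
# Confusion matrix heatmap for the decision tree (sketch)
cm = confusion_matrix(y_test, predictions)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()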
from sklearn.ensemble import RandomForestClassifier
rdc = RandomForestClassifier(n_estimators=100)
rdc.fit(X_train,y_train)
RandomForestClassifier()
predictions = rdc.predict(X_test)
print(classification_report(y_test,predictions))
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1333
           1       0.98      0.99      0.98      1520
           2       0.96      0.96      0.96      1414
           3       0.95      0.93      0.94      1471
           4       0.96      0.97      0.96      1358
           5       0.97      0.95      0.96      1205
           6       0.97      0.98      0.97      1397
           7       0.97      0.95      0.96      1480
           8       0.95      0.95      0.95      1334
           9       0.93      0.94      0.94      1348

    accuracy                           0.96     13860
   macro avg       0.96      0.96      0.96     13860
weighted avg       0.96      0.96      0.96     13860
X_train.shape, X_test.shape
((28140, 784), (13860, 784))
predictions.shape
(13860,)
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
accuracy_score(y_test, predictions)
0.9608225108225108
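Tree ensembles also expose feature_importances_, which can be reshaped back onto the 28x28 grid to see which pixels the forest actually relies on; a sketch assuming rdc is the fitted RandomForestClassifier from above:
# Per-pixel importance map from the fitted random forest (sketch)
importances = rdc.feature_importances_.reshape(28, 28)
plt.figure(figsize=(6,6))
plt.imshow(importances, cmap='hot')
plt.title('Random forest pixel importances')
plt.colorbar()
plt.show()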
df_test = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')
df_submission = pd.read_csv('/kaggle/input/digit-recognizer/sample_submission.csv')
input_features = [col for col in df]
print(len(input_features))
input_features.remove("label")
len(input_features)
785
784
submission_X = df_test[input_features]
submission_y = rdc.predict(submission_X)
df_submission['Label'] = submission_y
df_submission.to_csv("submission2.csv", index = False)
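Before uploading, it is worth re-reading the file and checking it against the sample submission format (an ImageId column plus the Label column filled above); a minimal sketch:
# Sanity-check the written submission against the sample (sketch)
check = pd.read_csv("submission2.csv")
print(check.shape == df_submission.shape)   # same number of rows and columns as the sample
print(check.columns.tolist())               # expected something like ['ImageId', 'Label']
print(check['Label'].between(0, 9).all())   # every prediction is a digit 0-9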