# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/digit-recognizer/sample_submission.csv
/kaggle/input/digit-recognizer/train.csv
/kaggle/input/digit-recognizer/test.csv
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))
digit-recognizer


df = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')
train_images = df.drop('label',axis=1)
labels = df['label']
train_images.head()
pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 pixel9 ... pixel774 pixel775 pixel776 pixel777 pixel778 pixel779 pixel780 pixel781 pixel782 pixel783
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 784 columns

labels.head()
0    1
1    0
2    1
3    4
4    0
Name: label, dtype: int64
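# Pixel values are raw grayscale intensities; a quick check (a small sanity probe,
# not part of the original flow) confirms they span 0-255, which the tree-based
# models below handle without any scaling:
print(train_images.values.min(), train_images.values.max())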
# Here is a quick (if crude) way to view the first few training images with their labels as titles
fig, axes = plt.subplots(nrows=3, ncols=3)
image_index = 0
for x in range(3):
    for y in range(3):
        axes[x, y].set_title(labels[image_index])
        # each row holds 784 pixel values; reshape to the original 28x28 image
        axes[x, y].imshow(train_images.iloc[image_index].values.reshape(28, 28))
        axes[x, y].axis('off')
        image_index += 1
plt.tight_layout()

plt.figure(figsize=(10,6))
sns.countplot(x=labels)  # pass x as a keyword to avoid the seaborn FutureWarning
plt.grid()
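# The plot shows a mild class imbalance; the exact counts (a small complementary
# check) make it easier to quantify:
labels.value_counts().sort_index()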

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_images, labels, test_size=0.33, random_state=42)
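# An optional variant (not used below): since the classes are mildly imbalanced,
# passing stratify=labels would keep the per-digit proportions identical in both splits:
# X_train, X_test, y_train, y_test = train_test_split(
#     train_images, labels, test_size=0.33, random_state=42, stratify=labels)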
from sklearn.tree import DecisionTreeClassifier
'''
Since we don't have an equal distribution of examples
per class, we balance the per-class weights with class_weight='balanced'
'''
dtree = DecisionTreeClassifier(class_weight='balanced')
dtree.fit(X_train,y_train)
DecisionTreeClassifier(class_weight='balanced')
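# What class_weight='balanced' actually does: each class gets weight
# n_samples / (n_classes * bincount(y)), so rarer digits count proportionally more.
# A minimal sketch to inspect the computed weights:
from sklearn.utils.class_weight import compute_class_weight
weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
print(dict(zip(np.unique(y_train), weights)))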
predictions = dtree.predict(X_test)
print(classification_report(y_test,predictions))
              precision    recall  f1-score   support

           0       0.93      0.92      0.93      1333
           1       0.93      0.95      0.94      1520
           2       0.84      0.84      0.84      1414
           3       0.83      0.81      0.82      1471
           4       0.84      0.85      0.84      1358
           5       0.78      0.79      0.78      1205
           6       0.88      0.86      0.87      1397
           7       0.88      0.89      0.88      1480
           8       0.79      0.79      0.79      1334
           9       0.80      0.80      0.80      1348

    accuracy                           0.85     13860
   macro avg       0.85      0.85      0.85     13860
weighted avg       0.85      0.85      0.85     13860

from sklearn.ensemble import RandomForestClassifier
rdc = RandomForestClassifier(n_estimators=100)
rdc.fit(X_train,y_train)
RandomForestClassifier()
predictions = rdc.predict(X_test)
print(classification_report(y_test,predictions))
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1333
           1       0.98      0.99      0.98      1520
           2       0.96      0.96      0.96      1414
           3       0.95      0.93      0.94      1471
           4       0.96      0.97      0.96      1358
           5       0.97      0.95      0.96      1205
           6       0.97      0.98      0.97      1397
           7       0.97      0.95      0.96      1480
           8       0.95      0.95      0.95      1334
           9       0.93      0.94      0.94      1348

    accuracy                           0.96     13860
   macro avg       0.96      0.96      0.96     13860
weighted avg       0.96      0.96      0.96     13860

X_train.shape, X_test.shape
((28140, 784), (13860, 784))
predictions.shape
(13860,)
from sklearn.metrics import accuracy_score  # classification_report and confusion_matrix were imported above
accuracy_score(y_test, predictions)
0.9608225108225108
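# confusion_matrix is imported above but never used; a heatmap of the random forest's
# errors (rows = true digit, columns = predicted digit) complements the classification report:
cm = confusion_matrix(y_test, predictions)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')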
df_test = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')
df_submission = pd.read_csv('/kaggle/input/digit-recognizer/sample_submission.csv')
input_features = [col for col in df]  # iterating a DataFrame yields its column names
print(len(input_features))
input_features.remove("label")  # drop the target, leaving the 784 pixel columns
len(input_features)
785
784
submission_X = df_test[input_features]
submission_y = rdc.predict(submission_X)

df_submission['Label'] = submission_y
df_submission.to_csv("submission2.csv", index=False)
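# A quick sanity check (a minimal sketch) before submitting: one row per test image,
# with the ImageId/Label columns inherited from sample_submission.csv:
print(df_submission.shape, df_test.shape)
df_submission.head()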
