import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, name) for name in names]
    for full_path in full_paths:
        print(full_path)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from subprocess import check_output
# Show the contents of ../input by running `ls` and decoding its raw output.
raw_listing = check_output(["ls", "../input"])
print(raw_listing.decode("utf8"))


# Load the training CSV, then separate the target column from the features:
# `labels` is the 'label' column, `train_images` is every remaining column.
df = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')
labels = df['label']
train_images = df.drop(columns='label')
# --- Recorded notebook output: first rows of the pixel features ---
#   pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 pixel9 ... pixel774 pixel775 pixel776 pixel777 pixel778 pixel779 pixel780 pixel781 pixel782 pixel783
# 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
# 1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
# 2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
# 3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
# 4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
#
# 5 rows × 784 columns
#
# --- Recorded notebook output: first rows of the labels ---
# 0    1
# 1    0
# 2    1
# 3    4
# 4    0
# Name: label, dtype: int64
# Quick (and admittedly crude) visual check: show the first nine training
# images in a 3x3 grid, using each image's label as the subplot title.
fig, axes = plt.subplots(nrows=3, ncols=3)
image_index = 0
for x in range(0, 3):
    for y in range(0, 3):
        # Each row of train_images holds 784 pixel values, i.e. a flattened
        # 28x28 image, so reshape it before drawing.
        pixels = train_images.iloc[image_index].to_numpy().reshape(28, 28)
        axes[x, y].imshow(pixels, cmap='gray')
        axes[x, y].set_title(labels.iloc[image_index])
        axes[x, y].axis('off')  # tick marks only add clutter on tiny images
        image_index += 1
fig.tight_layout()
plt.show()

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Hold out 33% of the labelled data for evaluation; the fixed random_state
# makes the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_images, labels, test_size=0.33, random_state=42
)

# Since we don't have an equal distribution of digits per class, we
# normalize the class weights with class_weight='balanced'.
dtree = DecisionTreeClassifier(class_weight='balanced')
# The estimator must be fitted before predict() can be called; an unfitted
# scikit-learn estimator raises NotFittedError.
dtree.fit(X_train, y_train)
predictions = dtree.predict(X_test)
# Per-class precision / recall / f1 on the held-out split.
print(classification_report(y_test, predictions))
# --- Recorded notebook output: decision tree classification report ---
#               precision    recall  f1-score   support
#
#            0       0.93      0.92      0.93      1333
#            1       0.93      0.95      0.94      1520
#            2       0.84      0.84      0.84      1414
#            3       0.83      0.81      0.82      1471
#            4       0.84      0.85      0.84      1358
#            5       0.78      0.79      0.78      1205
#            6       0.88      0.86      0.87      1397
#            7       0.88      0.89      0.88      1480
#            8       0.79      0.79      0.79      1334
#            9       0.80      0.80      0.80      1348
#
#     accuracy                           0.85     13860
#    macro avg       0.85      0.85      0.85     13860
# weighted avg       0.85      0.85      0.85     13860

from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees, trained on the same split as the single tree.
rdc = RandomForestClassifier(n_estimators=100)
# Fit before predicting; predict() on an unfitted estimator raises
# NotFittedError.
rdc.fit(X_train, y_train)
predictions = rdc.predict(X_test)
# Per-class precision / recall / f1 on the held-out split.
print(classification_report(y_test, predictions))
# --- Recorded notebook output: random forest classification report ---
#               precision    recall  f1-score   support
#
#            0       0.97      0.99      0.98      1333
#            1       0.98      0.99      0.98      1520
#            2       0.96      0.96      0.96      1414
#            3       0.95      0.93      0.94      1471
#            4       0.96      0.97      0.96      1358
#            5       0.97      0.95      0.96      1205
#            6       0.97      0.98      0.97      1397
#            7       0.97      0.95      0.96      1480
#            8       0.95      0.95      0.95      1334
#            9       0.93      0.94      0.94      1348
#
#     accuracy                           0.96     13860
#    macro avg       0.96      0.96      0.96     13860
# weighted avg       0.96      0.96      0.96     13860

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# A bare expression is only echoed inside a notebook cell; print explicitly
# so the values are also visible when this file runs as a script.
print((X_train.shape, X_test.shape))
# Recorded notebook output: ((28140, 784), (13860, 784))

# Overall accuracy of the most recent `predictions` on the held-out split.
print(accuracy_score(y_test, predictions))
# Build the Kaggle submission from the competition test set.
df_test = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')
df_submission = pd.read_csv('/kaggle/input/digit-recognizer/sample_submission.csv')

# Select exactly the feature columns the model was trained on. Iterating over
# `df` directly would also yield the target column 'label', which was dropped
# from the training features and must not be passed to predict().
input_features = [col for col in df if col != 'label']
submission_X = df_test[input_features]
submission_y = rdc.predict(submission_X)

# Overwrite the placeholder predictions in the sample file and save without
# the DataFrame index.
df_submission['Label'] = submission_y
df_submission.to_csv("submission2.csv", index=False)
