[MNC] Loan Default Prediction Model [Final Version]
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import catboost
import xgboost
import lightgbm
# from sklearn.dummy import DummyClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.naive_bayes import GaussianNB
# from sklearn.svm import SVC
# from sklearn.experimental import enable_hist_gradient_boosting
# from sklearn.neural_network import MLPClassifier
# from sklearn.ensemble import HistGradientBoostingClassifier
# from sklearn.ensemble import ExtraTreesClassifier, BaggingClassifier
# from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc, confusion_matrix
# from tpot import TPOTClassifier
/home/notitle/anaconda3/envs/loan/lib/python3.8/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
/home/notitle/anaconda3/envs/loan/lib/python3.8/site-packages/tpot/builtins/__init__.py:36: UserWarning: Warning: optional dependency `torch` is not available. - skipping import of NN models.
df = pd.read_csv("./data/train.csv")
df_test = pd.read_csv("./data/test.csv")
df_submit = pd.read_csv("./data/sample_submission.csv")
df_train = df
df
 | int_rate | annual_inc | dti | delinq_2yrs | inq_last_6mths | pub_rec | revol_bal | total_acc | collections_12_mths_ex_med | acc_now_delinq | ... | term1 | open_acc | installment | revol_util | out_prncp | out_prncp_inv | total_rec_int | fico_range_low | fico_range_high | depvar
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0824 | 21000.0 | 29.19 | 0 | 1 | 0 | 3016 | 26 | 0 | 0 | ... | 1 | 18 | 37.74 | 0.076 | 0.0 | 0.0 | 157.94 | 765 | 769 | 0 |
1 | 0.1299 | 80000.0 | 4.82 | 0 | 1 | 1 | 5722 | 24 | 0 | 0 | ... | 1 | 8 | 269.52 | 0.447 | 0.0 | 0.0 | 1702.42 | 665 | 669 | 0 |
2 | 0.1299 | 38000.0 | 23.66 | 0 | 3 | 0 | 6511 | 18 | 0 | 0 | ... | 1 | 7 | 168.45 | 0.880 | 0.0 | 0.0 | 1066.64 | 670 | 674 | 0 |
3 | 0.1367 | 100000.0 | 16.27 | 4 | 2 | 0 | 6849 | 30 | 0 | 0 | ... | 1 | 12 | 510.27 | 0.457 | 0.0 | 0.0 | 1256.24 | 680 | 684 | 1 |
4 | 0.1269 | 30000.0 | 25.28 | 0 | 1 | 2 | 8197 | 12 | 0 | 0 | ... | 1 | 8 | 335.45 | 0.416 | 0.0 | 0.0 | 871.04 | 660 | 664 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
99995 | 0.1757 | 65000.0 | 17.67 | 0 | 3 | 1 | 11255 | 21 | 1 | 0 | ... | 1 | 13 | 718.75 | 0.780 | 0.0 | 0.0 | 5373.29 | 660 | 664 | 1 |
99996 | 0.0890 | 65000.0 | 2.88 | 0 | 0 | 0 | 2105 | 12 | 0 | 0 | ... | 1 | 7 | 190.52 | 0.120 | 0.0 | 0.0 | 835.66 | 765 | 769 | 0 |
99997 | 0.1349 | 46000.0 | 32.12 | 0 | 1 | 0 | 8998 | 20 | 0 | 0 | ... | 1 | 19 | 217.16 | 0.643 | 0.0 | 0.0 | 1261.67 | 665 | 669 | 0 |
99998 | 0.2115 | 31000.0 | 4.53 | 0 | 1 | 0 | 3875 | 4 | 0 | 0 | ... | 1 | 3 | 207.64 | 0.731 | 0.0 | 0.0 | 1357.69 | 710 | 714 | 1 |
99999 | 0.1599 | 125000.0 | 33.33 | 0 | 0 | 0 | 34580 | 30 | 0 | 0 | ... | 1 | 19 | 1164.42 | 0.499 | 0.0 | 0.0 | 8882.58 | 690 | 694 | 0 |
100000 rows × 76 columns
No separate data preprocessing is needed here; it was done in advance.
This notebook therefore focuses on model training.
# pd.options.display.max_rows = 80
# df_train.nunique()
# df_test.nunique()
X = df_train.drop("depvar", axis=1)
y = df_train["depvar"]
print(len(df_train.columns))
df_train.columns
76
Index(['int_rate', 'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'pub_rec', 'revol_bal', 'total_acc', 'collections_12_mths_ex_med', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'chargeoff_within_12_mths', 'delinq_amnt', 'tax_liens', 'emp_length1', 'emp_length2', 'emp_length3', 'emp_length4', 'emp_length5', 'emp_length6', 'emp_length7', 'emp_length8', 'emp_length9', 'emp_length10', 'emp_length11', 'emp_length12', 'home_ownership1', 'home_ownership2', 'home_ownership3', 'home_ownership4', 'home_ownership5', 'home_ownership6', 'verification_status1', 'verification_status2', 'verification_status3', 'purpose1', 'purpose2', 'purpose3', 'purpose4', 'purpose5', 'purpose6', 'purpose7', 'purpose8', 'purpose9', 'purpose10', 'purpose11', 'purpose12', 'purpose13', 'purpose14', 'initial_list_status1', 'initial_list_status2', 'mths_since_last_delinq1', 'mths_since_last_delinq2', 'mths_since_last_delinq3', 'mths_since_last_delinq4', 'mths_since_last_delinq5', 'mths_since_last_delinq6', 'mths_since_last_delinq7', 'mths_since_last_delinq8', 'mths_since_last_delinq9', 'mths_since_last_delinq10', 'mths_since_last_delinq11', 'funded_amnt', 'funded_amnt_inv', 'total_rec_late_fee', 'term1', 'open_acc', 'installment', 'revol_util', 'out_prncp', 'out_prncp_inv', 'total_rec_int', 'fico_range_low', 'fico_range_high', 'depvar'], dtype='object')
df_train["emp_length"] = 0
emp_df = df_train.iloc[:, 15:27]
df_train["purpose"] = 0
pur_df = df_train.iloc[:, 36:50]
df_train["mths_since_last_delinq"] = 0
mths_df = df_train.iloc[:, 52:63]
df_train["home_ownership"] = 0
home_df = df_train.iloc[:, 27:33]
temp = list()
for i in tqdm(range(0,100000)):
str1 = "emp_length"
for j in range(1, 13):
str2 = str(j)
if emp_df[str1+str2][i] == 1:
temp.append(j)
break
df_train["emp_length"] = temp
temp = list()
for i in tqdm(range(0,100000)):
str1 = "purpose"
for j in range(1, 15):
str2 = str(j)
if pur_df[str1+str2][i] == 1:
temp.append(j)
break
df_train["purpose"] = temp
temp = list()
for i in tqdm(range(0,100000)):
str1 = "mths_since_last_delinq"
for j in range(1, 12):
str2 = str(j)
if mths_df[str1+str2][i] == 1:
temp.append(j)
break
df_train["mths_since_last_delinq"] = temp
temp = list()
for i in tqdm(range(0,100000)):
str1 = "home_ownership"
for j in range(1, 7):
str2 = str(j)
if home_df[str1+str2][i] == 1:
temp.append(j)
break
df_train["home_ownership"] = temp
100%|██████████| 100000/100000 [00:03<00:00, 29317.28it/s]
100%|██████████| 100000/100000 [00:02<00:00, 42521.94it/s]
100%|██████████| 100000/100000 [00:02<00:00, 39833.68it/s]
100%|██████████| 100000/100000 [00:03<00:00, 26846.15it/s]
This is the feature-engineering step: the one-hot encoding is converted back into a label encoding.
Since the models trained below are all tree-based, this step is optional. Note that `X` was created from `df_train` before this transformation, so the models are actually fit on the original one-hot features.
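As a side note, the row-wise loops above can be vectorized. A minimal sketch, assuming each one-hot group contains exactly one active dummy per row:
# Vectorized equivalent of the loops above: the label is the position
# of the single 1 within each one-hot group, plus 1.
for name, n_levels in [("emp_length", 12), ("purpose", 14),
                       ("mths_since_last_delinq", 11), ("home_ownership", 6)]:
    cols = [f"{name}{j}" for j in range(1, n_levels + 1)]
    df_train[name] = df_train[cols].to_numpy().argmax(axis=1) + 1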
df_train = df_train.drop(columns=[
'emp_length1', 'emp_length2', 'emp_length3', 'emp_length4',
'emp_length5', 'emp_length6', 'emp_length7', 'emp_length8',
'emp_length9', 'emp_length10', 'emp_length11', 'emp_length12',
'purpose1', 'purpose2', 'purpose3', 'purpose4',
'purpose5', 'purpose6', 'purpose7',
'purpose8', 'purpose9', 'purpose10', 'purpose11', 'purpose12',
'purpose13', 'purpose14',
'mths_since_last_delinq1','mths_since_last_delinq2',
'mths_since_last_delinq3', 'mths_since_last_delinq4',
'mths_since_last_delinq5', 'mths_since_last_delinq6',
'mths_since_last_delinq7', 'mths_since_last_delinq8',
'mths_since_last_delinq9', 'mths_since_last_delinq10',
'mths_since_last_delinq11',
'home_ownership1', 'home_ownership2', 'home_ownership3',
'home_ownership4', 'home_ownership5', 'home_ownership6',
])
# Optional
# df_train = df_train.drop(columns=[
# "initial_list_status2", "funded_amnt", "funded_amnt_inv", "out_prncp", "fico_range_high"
# ])
df_train
 | int_rate | annual_inc | dti | delinq_2yrs | inq_last_6mths | pub_rec | revol_bal | total_acc | collections_12_mths_ex_med | acc_now_delinq | ... | out_prncp | out_prncp_inv | total_rec_int | fico_range_low | fico_range_high | depvar | emp_length | purpose | mths_since_last_delinq | home_ownership
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0824 | 21000.0 | 29.19 | 0 | 1 | 0 | 3016 | 26 | 0 | 0 | ... | 0.0 | 0.0 | 157.94 | 765 | 769 | 0 | 1 | 2 | 1 | 6 |
1 | 0.1299 | 80000.0 | 4.82 | 0 | 1 | 1 | 5722 | 24 | 0 | 0 | ... | 0.0 | 0.0 | 1702.42 | 665 | 669 | 0 | 2 | 3 | 1 | 6 |
2 | 0.1299 | 38000.0 | 23.66 | 0 | 3 | 0 | 6511 | 18 | 0 | 0 | ... | 0.0 | 0.0 | 1066.64 | 670 | 674 | 0 | 4 | 3 | 11 | 6 |
3 | 0.1367 | 100000.0 | 16.27 | 4 | 2 | 0 | 6849 | 30 | 0 | 0 | ... | 0.0 | 0.0 | 1256.24 | 680 | 684 | 1 | 3 | 3 | 5 | 2 |
4 | 0.1269 | 30000.0 | 25.28 | 0 | 1 | 2 | 8197 | 12 | 0 | 0 | ... | 0.0 | 0.0 | 871.04 | 660 | 664 | 1 | 4 | 3 | 1 | 6 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
99995 | 0.1757 | 65000.0 | 17.67 | 0 | 3 | 1 | 11255 | 21 | 1 | 0 | ... | 0.0 | 0.0 | 5373.29 | 660 | 664 | 1 | 7 | 3 | 1 | 6 |
99996 | 0.0890 | 65000.0 | 2.88 | 0 | 0 | 0 | 2105 | 12 | 0 | 0 | ... | 0.0 | 0.0 | 835.66 | 765 | 769 | 0 | 11 | 3 | 1 | 6 |
99997 | 0.1349 | 46000.0 | 32.12 | 0 | 1 | 0 | 8998 | 20 | 0 | 0 | ... | 0.0 | 0.0 | 1261.67 | 665 | 669 | 0 | 4 | 2 | 1 | 6 |
99998 | 0.2115 | 31000.0 | 4.53 | 0 | 1 | 0 | 3875 | 4 | 0 | 0 | ... | 0.0 | 0.0 | 1357.69 | 710 | 714 | 1 | 12 | 10 | 1 | 6 |
99999 | 0.1599 | 125000.0 | 33.33 | 0 | 0 | 0 | 34580 | 30 | 0 | 0 | ... | 0.0 | 0.0 | 8882.58 | 690 | 694 | 0 | 5 | 3 | 1 | 2 |
100000 rows × 37 columns
X_train1 = X[:20000]
X_train2 = X[20000:40000]
X_train3 = X[40000:60000]
X_train4 = X[60000:80000]
X_train5 = X[80000:]
y_train1 = y[:20000]
y_train2 = y[20000:40000]
y_train3 = y[40000:60000]
y_train4 = y[60000:80000]
y_train5 = y[80000:]
##########
frames = [X_train1, X_train2, X_train3, X_train4]
X_train_dataset1 = pd.concat(frames)
frames = [X_train1, X_train2, X_train3, X_train5]
X_train_dataset2 = pd.concat(frames)
frames = [X_train1, X_train2, X_train4, X_train5]
X_train_dataset3 = pd.concat(frames)
frames = [X_train1, X_train3, X_train4, X_train5]
X_train_dataset4 = pd.concat(frames)
frames = [X_train2, X_train3, X_train4, X_train5]
X_train_dataset5 = pd.concat(frames)
##########
frames = [y_train1, y_train2, y_train3, y_train4]
y_train_dataset1 = pd.concat(frames)
frames = [y_train1, y_train2, y_train3, y_train5]
y_train_dataset2 = pd.concat(frames)
frames = [y_train1, y_train2, y_train4, y_train5]
y_train_dataset3 = pd.concat(frames)
frames = [y_train1, y_train3, y_train4, y_train5]
y_train_dataset4 = pd.concat(frames)
frames = [y_train2, y_train3, y_train4, y_train5]
y_train_dataset5 = pd.concat(frames)
The key technique in this assignment is how the training data is split and recombined for each model.
In the code above, the full dataset is first divided into five folds of 20,000 rows each.
Model 1 trains on folds 1, 2, 3, 4 (80% of the data).
Model 2 trains on folds 1, 2, 3, 5 (80% of the data).
Model 3 trains on folds 1, 2, 4, 5 (80% of the data).
Model 4 trains on folds 1, 3, 4, 5 (80% of the data).
Model 5 trains on folds 2, 3, 4, 5 (80% of the data).
With the models trained this way,
model 1 is validated on fold 5,
model 2 is validated on fold 4,
model 3 is validated on fold 3,
model 4 is validated on fold 2,
and model 5 is validated on fold 1.
For the final submission, the same test data is run through each model for prediction,
and the label that a majority of the models (at least 3 of the 5) agree on is submitted.
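Incidentally, the manual slicing above amounts to unshuffled 5-fold cross-validation, so the already-imported KFold could produce the same folds. A minimal sketch under that assumption:
kf = KFold(n_splits=5, shuffle=False)  # same contiguous 20,000-row folds as above
splits = list(kf.split(X))
# The first split validates on fold 1 and trains on folds 2-5,
# i.e. it corresponds to X_train_dataset5 / X_train1 above.
train_idx, val_idx = splits[0]
X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]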
model1 = xgboost.XGBClassifier()
model2 = xgboost.XGBClassifier()
model3 = xgboost.XGBClassifier()
model4 = xgboost.XGBClassifier()
model5 = xgboost.XGBClassifier()
model1.fit(X_train_dataset1, y_train_dataset1)
model2.fit(X_train_dataset2, y_train_dataset2)
model3.fit(X_train_dataset3, y_train_dataset3)
model4.fit(X_train_dataset4, y_train_dataset4)
model5.fit(X_train_dataset5, y_train_dataset5)
/home/notitle/anaconda3/envs/loan/lib/python3.8/site-packages/xgboost/sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
/home/notitle/anaconda3/envs/loan/lib/python3.8/site-packages/xgboost/data.py:250: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
WARNING: ../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. (repeated once per fit)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, enable_categorical=False, gamma=0, gpu_id=-1, importance_type=None, interaction_constraints='', learning_rate=0.300000012, max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=4, num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact', validate_parameters=1, verbosity=None)
model6 = lightgbm.LGBMClassifier()
model7 = lightgbm.LGBMClassifier()
model8 = lightgbm.LGBMClassifier()
model9 = lightgbm.LGBMClassifier()
model10 = lightgbm.LGBMClassifier()
model6.fit(X_train_dataset1, y_train_dataset1)
model7.fit(X_train_dataset2, y_train_dataset2)
model8.fit(X_train_dataset3, y_train_dataset3)
model9.fit(X_train_dataset4, y_train_dataset4)
model10.fit(X_train_dataset5, y_train_dataset5)
LGBMClassifier()
model11 = catboost.CatBoostClassifier()
model12 = catboost.CatBoostClassifier()
model13 = catboost.CatBoostClassifier()
model14 = catboost.CatBoostClassifier()
model15 = catboost.CatBoostClassifier()
model11.fit(X_train_dataset1, y_train_dataset1)
model12.fit(X_train_dataset2, y_train_dataset2)
model13.fit(X_train_dataset3, y_train_dataset3)
model14.fit(X_train_dataset4, y_train_dataset4)
model15.fit(X_train_dataset5, y_train_dataset5)
Learning rate set to 0.06692
0: learn: 0.6759286 total: 66.1ms remaining: 1m 6s
1: learn: 0.6610865 total: 77.6ms remaining: 38.7s
...
999: learn: 0.4431009 total: 18.1s remaining: 0us
(similar 1000-iteration logs for the other four folds; final train loss ≈ 0.440-0.443 in each)
<catboost.core.CatBoostClassifier at 0x7fc76e585a60>
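The fifteen constructor/fit pairs above follow one pattern and could be collapsed into a loop. A minimal sketch with the same default hyperparameters (the factories and models names are illustrative, not part of the original code):
datasets = [(X_train_dataset1, y_train_dataset1), (X_train_dataset2, y_train_dataset2),
            (X_train_dataset3, y_train_dataset3), (X_train_dataset4, y_train_dataset4),
            (X_train_dataset5, y_train_dataset5)]
factories = [xgboost.XGBClassifier, lightgbm.LGBMClassifier,
             lambda: catboost.CatBoostClassifier(verbose=0)]  # verbose=0 silences the per-iteration log
models = []
for make_model in factories:  # XGB -> models 1-5, LGBM -> 6-10, CatBoost -> 11-15
    for X_tr, y_tr in datasets:
        m = make_model()
        m.fit(X_tr, y_tr)
        models.append(m)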
y_prob1 = model1.predict_proba(X_train5)
y_prob2 = model2.predict_proba(X_train4)
y_prob3 = model3.predict_proba(X_train3)
y_prob4 = model4.predict_proba(X_train2)
y_prob5 = model5.predict_proba(X_train1)
y_prob6 = model6.predict_proba(X_train5)
y_prob7 = model7.predict_proba(X_train4)
y_prob8 = model8.predict_proba(X_train3)
y_prob9 = model9.predict_proba(X_train2)
y_prob10 = model10.predict_proba(X_train1)
y_prob11 = model11.predict_proba(X_train5)
y_prob12 = model12.predict_proba(X_train4)
y_prob13 = model13.predict_proba(X_train3)
y_prob14 = model14.predict_proba(X_train2)
y_prob15 = model15.predict_proba(X_train1)
# Predict 1 when the predicted default probability is >= thr
thr = 0.35
print(f"thr > {thr}")
pred1 = (y_prob1[:,1] >= thr).astype(np.int64)
pred2 = (y_prob2[:,1] >= thr).astype(np.int64)
pred3 = (y_prob3[:,1] >= thr).astype(np.int64)
pred4 = (y_prob4[:,1] >= thr).astype(np.int64)
pred5 = (y_prob5[:,1] >= thr).astype(np.int64)
pred6 = (y_prob6[:,1] >= thr).astype(np.int64)
pred7 = (y_prob7[:,1] >= thr).astype(np.int64)
pred8 = (y_prob8[:,1] >= thr).astype(np.int64)
pred9 = (y_prob9[:,1] >= thr).astype(np.int64)
pred10 = (y_prob10[:,1] >= thr).astype(np.int64)
pred11 = (y_prob11[:,1] >= thr).astype(np.int64)
pred12 = (y_prob12[:,1] >= thr).astype(np.int64)
pred13 = (y_prob13[:,1] >= thr).astype(np.int64)
pred14 = (y_prob14[:,1] >= thr).astype(np.int64)
pred15 = (y_prob15[:,1] >= thr).astype(np.int64)
print(f" model1의 예측비율 : {pred1.sum() / len(pred1)}")
print(f" model2의 예측비율 : {pred2.sum() / len(pred2)}")
print(f" model3의 예측비율 : {pred3.sum() / len(pred3)}")
print(f" model4의 예측비율 : {pred4.sum() / len(pred4)}")
print(f" model5의 예측비율 : {pred5.sum() / len(pred5)}")
print(f" model6의 예측비율 : {pred6.sum() / len(pred6)}")
print(f" model7의 예측비율 : {pred7.sum() / len(pred7)}")
print(f" model8의 예측비율 : {pred8.sum() / len(pred8)}")
print(f" model9의 예측비율 : {pred9.sum() / len(pred9)}")
print(f" model10의 예측비율 : {pred10.sum() / len(pred10)}")
print(f" model11의 예측비율 : {pred11.sum() / len(pred11)}")
print(f" model12의 예측비율 : {pred12.sum() / len(pred12)}")
print(f" model13의 예측비율 : {pred13.sum() / len(pred13)}")
print(f" model14의 예측비율 : {pred14.sum() / len(pred14)}")
print(f" model15의 예측비율 : {pred15.sum() / len(pred15)}")
thr > 0.35
 model1 positive-prediction ratio : 0.37745
 model2 positive-prediction ratio : 0.38155
 model3 positive-prediction ratio : 0.3768
 model4 positive-prediction ratio : 0.3799
 model5 positive-prediction ratio : 0.37465
 model6 positive-prediction ratio : 0.37305
 model7 positive-prediction ratio : 0.3825
 model8 positive-prediction ratio : 0.3756
 model9 positive-prediction ratio : 0.3755
 model10 positive-prediction ratio : 0.3704
 model11 positive-prediction ratio : 0.376
 model12 positive-prediction ratio : 0.38415
 model13 positive-prediction ratio : 0.37595
 model14 positive-prediction ratio : 0.37735
 model15 positive-prediction ratio : 0.37725
Before submitting, adjust the threshold value so that the accuracy and F1 scores improve.
You can also write this search as a function, as shown below.
I did not use the function myself!
def calc_sum_f1_and_accuracy(y_actual, y_pred):
    # Helper referenced below; it was not defined in the original notebook,
    # so it is assumed to sum the F1 and accuracy scores, as its name suggests.
    return f1_score(y_actual, y_pred) + accuracy_score(y_actual, y_pred)

def calc_score_model(model, name, X_train, y_train, X_val, y_val):
    # Fit the model, then sweep thresholds in steps of 0.001 and keep the
    # one that maximizes F1 + accuracy on the validation fold.
    model.fit(X_train, y_train)
    y_prob = model.predict_proba(X_val)
    thr_result = 0.35
    max_val = 0.0
    scale = 1000
    for thr in range(1, scale):
        val = calc_sum_f1_and_accuracy(y_val, (y_prob[:, 1] >= thr / scale))
        if val > max_val:
            thr_result = thr / scale
            max_val = val
    return [name, thr_result, max_val, model]
# function test
print(calc_score_model(xgboost.XGBClassifier(), 'XGB',
                       X_train_dataset1, y_train_dataset1, X_train5, y_train5))
def get_clf_eval(y_actual, y_pred, model):
    # accuracy = accuracy_score(y_actual, y_pred)
    # precision = precision_score(y_actual, y_pred)
    # recall = recall_score(y_actual, y_pred)
    AUC = roc_auc_score(y_actual, y_pred)
    F1 = f1_score(y_actual, y_pred)
    # print('\nAccuracy: {:.4f}'.format(accuracy))
    # print('Precision: {:.4f}'.format(precision))
    # print('Recall: {:.4f}'.format(recall))
    print(model)
    print('AUC: {:.4f}'.format(AUC))
    print('F1: {:.4f}'.format(F1))
    print()
# XGBoost performance check
get_clf_eval(y_train5, pred1, "model 1")
get_clf_eval(y_train4, pred2, "model 2")
get_clf_eval(y_train3, pred3, "model 3")
get_clf_eval(y_train2, pred4, "model 4")
get_clf_eval(y_train1, pred5, "model 5")
model 1
AUC: 0.7240
F1: 0.6246

model 2
AUC: 0.7208
F1: 0.6301

model 3
AUC: 0.7230
F1: 0.6305

model 4
AUC: 0.7215
F1: 0.6245

model 5
AUC: 0.7156
F1: 0.6188
get_clf_eval(y_train5, pred6, "model 6")
get_clf_eval(y_train4, pred7, "model 7")
get_clf_eval(y_train3, pred8, "model 8")
get_clf_eval(y_train2, pred9, "model 9")
get_clf_eval(y_train1, pred10, "model 10")
model 6
AUC: 0.7262
F1: 0.6273

model 7
AUC: 0.7223
F1: 0.6320

model 8
AUC: 0.7249
F1: 0.6329

model 9
AUC: 0.7244
F1: 0.6280

model 10
AUC: 0.7223
F1: 0.6272
get_clf_eval(y_train5, pred11, "model 11")
get_clf_eval(y_train4, pred12, "model 12")
get_clf_eval(y_train3, pred13, "model 13")
get_clf_eval(y_train2, pred14, "model 14")
get_clf_eval(y_train1, pred15, "model 15")
model 11
AUC: 0.7296
F1: 0.6317

model 12
AUC: 0.7275
F1: 0.6386

model 13
AUC: 0.7277
F1: 0.6364

model 14
AUC: 0.7266
F1: 0.6308

model 15
AUC: 0.7257
F1: 0.6316
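Assuming the models list from the training sketch above, the fifteen evaluations could also be produced in one loop (the validation folds cycle 5, 4, 3, 2, 1 within each model family):
val_sets = [(X_train5, y_train5), (X_train4, y_train4), (X_train3, y_train3),
            (X_train2, y_train2), (X_train1, y_train1)]
for k, m in enumerate(models, start=1):
    X_val, y_val = val_sets[(k - 1) % 5]
    pred = (m.predict_proba(X_val)[:, 1] >= thr).astype(np.int64)
    get_clf_eval(y_val, pred, f"model {k}")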
# Load the submission template
submit = pd.read_csv('./data/sample_submission.csv')
# Run predictions on the test set
df_test = pd.read_csv('./data/test.csv')
test = df_test.drop('ID', axis=1)
prob1 = model1.predict_proba(test)
out1 = (prob1[:,1] >= thr).astype(np.int64)
prob2 = model2.predict_proba(test)
out2 = (prob2[:,1] >= thr).astype(np.int64)
prob3 = model3.predict_proba(test)
out3 = (prob3[:,1] >= thr).astype(np.int64)
prob4 = model4.predict_proba(test)
out4 = (prob4[:,1] >= thr).astype(np.int64)
prob5 = model5.predict_proba(test)
out5 = (prob5[:,1] >= thr).astype(np.int64)
prob6 = model6.predict_proba(test)
out6 = (prob6[:,1] >= thr).astype(np.int64)
prob7 = model7.predict_proba(test)
out7 = (prob7[:,1] >= thr).astype(np.int64)
prob8 = model8.predict_proba(test)
out8 = (prob8[:,1] >= thr).astype(np.int64)
prob9 = model9.predict_proba(test)
out9 = (prob9[:,1] >= thr).astype(np.int64)
prob10 = model10.predict_proba(test)
out10 = (prob10[:,1] >= thr).astype(np.int64)
prob11 = model11.predict_proba(test)
out11 = (prob11[:,1] >= thr).astype(np.int64)
prob12 = model12.predict_proba(test)
out12 = (prob12[:,1] >= thr).astype(np.int64)
prob13 = model13.predict_proba(test)
out13 = (prob13[:,1] >= thr).astype(np.int64)
prob14 = model14.predict_proba(test)
out14 = (prob14[:,1] >= thr).astype(np.int64)
prob15 = model15.predict_proba(test)
out15 = (prob15[:,1] >= thr).astype(np.int64)
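These fifteen blocks follow the same pattern as well; a minimal sketch, again assuming the models list from the training sketch, which yields the same stacked vote array used below:
probs = [m.predict_proba(test)[:, 1] for m in models]
outs = [(p >= thr).astype(np.int64) for p in probs]  # out1 ... out15
out_list = np.array(outs)                            # shape (15, n_test)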
print(out1.sum() / len(out1))
print(out2.sum() / len(out2))
print(out3.sum() / len(out3))
print(out4.sum() / len(out4))
print(out5.sum() / len(out5))
print(out6.sum() / len(out6))
print(out7.sum() / len(out7))
print(out8.sum() / len(out8))
print(out9.sum() / len(out9))
print(out10.sum() / len(out10))
print(out11.sum() / len(out11))
print(out12.sum() / len(out12))
print(out13.sum() / len(out13))
print(out14.sum() / len(out14))
print(out15.sum() / len(out15))
0.3856097833370561
0.3853305785123967
0.385637703819522
0.38658700022336384
0.3859169086441814
0.38128210855483585
0.38172883627429083
0.381198347107438
0.38175675675675674
0.378992628992629
0.3858052267143176
0.3860285905740451
0.3832365423274514
0.3866428411882957
0.38418583873129325
# Stack the fifteen binary predictions into a (15, n_test) array for majority voting.
out_list = np.array([out1, out2, out3, out4, out5,
                     out6, out7, out8, out9, out10,
                     out11, out12, out13, out14, out15])
result = []
for i in range(len(test)):
    temp = 0
    for j in range(out_list.shape[0]):
        temp += out_list[j][i]
    # Predict 1 when at least 6 of the 15 models vote for default.
    if 6 <= temp:
        result.append(1)
    else:
        result.append(0)
print(f" 최종 결과물!!! ")
print(f" {sum(result) / len(result)} ")
print(len(result))
print(result.count(1))
submit["answer"] = result
 Final result!!!
 0.40037971856153676
35816
14340
# Save the submission file
submit.to_csv('./submit/submit.csv', index=False)