import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import xgboost
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, RidgeCV, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier

import sklearn.metrics as metrics
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc, confusion_matrix
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

EDA

Variable descriptions

  • int_rate : Interest rate of the loan the applicant received

  • annual_inc : Annual income

  • dti : Debt-to-income ratio

  • delinq_2yrs : Delinquencies on lines of credit in the last 2 years

  • inq_last_6mths : Inquiries into the applicant’s credit during the last 6 months

  • pub_rec : Number of bankruptcies listed in the public record

  • revol_bal : Total credit revolving balance

  • total_acc : Total number of credit card accounts in the applicant’s history (num_total_cc_accounts)

  • collections_12_mths_ex_med : Number of collections in the last 12 months, excluding medical collections (num_collections_last_12m)

  • acc_now_delinq : The number of accounts on which the borrower is now delinquent

  • tot_coll_amt : The total amount the applicant has ever had against them in collections (total_collection_amount_ever)

  • tot_cur_bal : Total current balance of all accounts

  • chargeoff_within_12_mths : Number of charge-offs within the last 12 months at the time of application, for the secondary applicant

  • delinq_amnt : Delinquency amount

  • tax_liens : Number of tax liens

  • emp_length1 ~ 12 : Number of years in the job

  • home_ownership1 ~ 6 : The ownership status of the applicant’s residence

  • verification_status1 ~ 3 : Type of verification of the joint income (verification_income_joint)

  • purpose1 ~ 14 : The purpose of the loan

  • initial_list_status1 ~ 2 : Initial listing status of the loan

  • mths_since_last_delinq1 ~ 11 : Months since the last delinquency

  • funded_amnt : Funded amount

  • funded_amnt_inv : Funded amount by investors

  • total_rec_late_fee : Late fees received to date

  • term1 : The number of payments on the loan; values are in months and can be either 36 or 60

  • open_acc : The number of open credit lines in the borrower’s credit file

  • installment : The monthly payment owed by the borrower if the loan originates

  • revol_util : Revolving line utilization rate, i.e. the amount of credit the borrower is using relative to all available revolving credit

  • out_prncp : Remaining outstanding principal for the total amount funded

  • out_prncp_inv : Remaining outstanding principal for the total amount funded by investors

  • total_rec_int : Interest received to date

  • fico_range_low : The lower boundary of the range the borrower’s FICO score at loan origination belongs to

  • fico_range_high : The upper boundary of the range the borrower’s FICO score at loan origination belongs to

  • depvar : Whether the customer defaulted (the dependent variable)

df = pd.read_csv('/content/drive/MyDrive/22-01-28/train.csv')
df.head()
(output: the first five rows of the training table, 76 columns in all: the 75 features listed above plus the depvar label)

The data MNC provided was clean from the start, and the feature engineering had already been done well.

(At first, since I had just received the data, I spent a lot of time doing feature engineering of my own, only to find it was already well taken care of...)

df.describe()
(output: df.describe() summary statistics for all 76 columns over 100,000 rows; for example, mean int_rate ≈ 0.1308, mean annual_inc ≈ 74,361, and mean depvar ≈ 0.3257)
# Check for missing values
pd.DataFrame(df.isnull().sum()).rename(columns={0:'Null Count'}).T
(output: Null Count is 0 for every one of the 76 columns)

This is where the quality of the feature engineering really shows: there is not a single missing value (NaN).

Presumably the intent was for us to focus on model tuning.

# Check the label (depvar) ratio
print('y=1 ratio :', df.depvar.sum()/len(df))
y=1 ratio : 0.32569

The f1 score, the y=1 ratio, and finally macro f1:

these three are what made this problem a real struggle... I will talk about them at the end.

Train Valid Split

Here is how I approached the problem: I chose XGBClassifier as the model,

and to train it I split the 100,000 rows into five equal parts

and trained on four parts at a time.

For example,

dataset1, dataset2, dataset3, dataset4, dataset5 each receive 20,000 rows, and

model1 = 1,2,3,4 (chunk 5 held out)

model2 = 1,2,3,5 (chunk 4 held out)

model3 = 1,2,4,5 (chunk 3 held out)

model4 = 1,3,4,5 (chunk 2 held out)

model5 = 2,3,4,5 (chunk 1 held out)

Proceeding this way, on the final test_dataset:

out1 = model1(test)

out2 = model2(test)

out3 = model3(test)

out4 = model4(test)

out5 = model5(test)

For the submission csv I went with majority voting: whenever 3 or more of the 5 models agree on a class, that class is what gets submitted!!!!
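The same scheme can be written compactly with scikit-learn's KFold. A minimal sketch (fold_models, votes, and majority are my own illustrative names; X, y, and test are the frames defined in the cells below):

from sklearn.model_selection import KFold
from xgboost import XGBClassifier

kf = KFold(n_splits=5, shuffle=False)      # five sequential 20,000-row chunks
fold_models = []
for train_idx, valid_idx in kf.split(X):   # train_idx covers the other four chunks
    m = XGBClassifier()                    # hyperparameters omitted in this sketch
    m.fit(X.iloc[train_idx], y.iloc[train_idx])
    fold_models.append(m)

# Majority vote on the test set: class 1 when >= 3 of the 5 models predict 1.
votes = sum(m.predict(test) for m in fold_models)
majority = (votes >= 3).astype(int)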

(There is still a lot I have not tried: augmenting the data with SMOTE, training a RandomForest model, and more to consider,

but I wasted my time going down rabbit holes and ended up training in a rush on the competition deadline.

Having no experience, there were so many terms I did not know ㅜㅜ)
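For reference, the SMOTE idea mentioned above is essentially a one-liner with the imbalanced-learn package. A minimal sketch, assuming imbalanced-learn is installed (it is not imported anywhere in this notebook) and that X and y are the feature matrix and label defined below:

# pip install imbalanced-learn
from imblearn.over_sampling import SMOTE

# Oversample the minority class (y=1, ~32.6% here) up to a 1:1 ratio
# by synthesizing new minority samples between nearest neighbors.
X_res, y_res = SMOTE(random_state=220205).fit_resample(X, y)
print(y_res.mean())  # ~0.5 after resampling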

X = df.drop('depvar', axis=1)
y = df['depvar']
X_train1 = X[:20000]
X_train2 = X[20000:40000]
X_train3 = X[40000:60000]
X_train4 = X[60000:80000]
X_train5 = X[80000:]

y_train1 = y[:20000]
y_train2 = y[20000:40000]
y_train3 = y[40000:60000]
y_train4 = y[60000:80000]
y_train5 = y[80000:]
##########
frames = [X_train1, X_train2, X_train3, X_train4]
X_train_dataset1 = pd.concat(frames)

frames = [X_train1, X_train2, X_train3, X_train5]
X_train_dataset2 = pd.concat(frames)

frames = [X_train1, X_train2, X_train4, X_train5]
X_train_dataset3 = pd.concat(frames)

frames = [X_train1, X_train3, X_train4, X_train5]
X_train_dataset4 = pd.concat(frames)

frames = [X_train2, X_train3, X_train4, X_train5]
X_train_dataset5 = pd.concat(frames)
##########

frames = [y_train1, y_train2, y_train3, y_train4]
y_train_dataset1 = pd.concat(frames)

frames = [y_train1, y_train2, y_train3, y_train5]
y_train_dataset2 = pd.concat(frames)

frames = [y_train1, y_train2, y_train4, y_train5]
y_train_dataset3 = pd.concat(frames)

frames = [y_train1, y_train3, y_train4, y_train5]
y_train_dataset4 = pd.concat(frames)

frames = [y_train2, y_train3, y_train4, y_train5]
y_train_dataset5 = pd.concat(frames)
# Check the number of rows in each training and validation chunk
print(f"X_train1 = {len(X_train1)}")
print(f"X_train2 = {len(X_train2)}")
print(f"X_train3 = {len(X_train3)}")
print(f"X_train4 = {len(X_train4)}")
print(f"X_train5 = {len(X_train5)}")
print()
print(f"y_train1 = {len(y_train1)}")
print(f"y_train2 = {len(y_train2)}")
print(f"y_train3 = {len(y_train3)}")
print(f"y_train4 = {len(y_train4)}")
print(f"y_train5 = {len(y_train5)}")
print()
print(f"y_train1 label ratio = {y_train1.sum() / len(y_train1)}")
print(f"y_train2 label ratio = {y_train2.sum() / len(y_train2)}")
print(f"y_train3 label ratio = {y_train3.sum() / len(y_train3)}")
print(f"y_train4 label ratio = {y_train4.sum() / len(y_train4)}")
print(f"y_train5 label ratio = {y_train5.sum() / len(y_train5)}")
X_train1 = 20000
X_train2 = 20000
X_train3 = 20000
X_train4 = 20000
X_train5 = 20000

y_train1 = 20000
y_train2 = 20000
y_train3 = 20000
y_train4 = 20000
y_train5 = 20000

y_train1 label ratio = 0.3256
y_train2 label ratio = 0.3226
y_train3 label ratio = 0.3293
y_train4 label ratio = 0.3329
y_train5 label ratio = 0.31805
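The chunk ratios land near the overall 0.32569 only because the rows happen to be well mixed; StratifiedKFold would enforce it by construction. A minimal sketch (skf is my own name, not from the original code):

from sklearn.model_selection import StratifiedKFold

# Each fold's label ratio is forced to match the overall y=1 ratio.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=220205)
for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y), start=1):
    print(f"fold {fold} y=1 ratio = {y.iloc[valid_idx].mean():.5f}")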

Something important just showed up above..

I tried to puzzle it out to the very end, but I hit my limits, so once this post is done I am off to study...

(There really are too many terms I do not know, and I can feel how much I still lack.)

As I will say again at the end, the core of the final result I want to submit is this:

  1. The y=1 ratio of the train_dataset must be nearly identical to the ratio in the test_dataset being submitted.

  2. Macro f1, this competition's evaluation metric, must be high.

  3. Obviously, as many classes as possible must be predicted correctly.

This turned out to be really hard.

I am still writing this post, so I have not solved it yet...

Here is the phenomenon:

when the f1 score gets high, the y=1 ratio swings way off.

ex) f1 score 0.666, y=1 ratio 0.5

The f1 score is driven by precision and recall,

and macro f1 is driven by the per-class f1 scores.

So precision and recall need to be high, but when I pushed only those values up,

the y=1 ratio swung off and, as a result, the leaderboard score dropped as well.

(For the record, the idea that the score drops because the y=1 ratio swings is only my guess.)
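Concretely, macro f1 is just the unweighted mean of the per-class f1 scores, which is why over-predicting class 1 can raise its own f1 while dragging the macro average down through class 0. A toy check with sklearn (the arrays are my own example, not competition data):

import numpy as np
from sklearn.metrics import f1_score

y_true = np.array([0, 0, 0, 1, 1])
y_pred = np.array([0, 1, 1, 1, 1])   # aggressive positives: recall on class 1 is 1.0

per_class = f1_score(y_true, y_pred, average=None)  # [f1 of class 0, f1 of class 1]
macro = f1_score(y_true, y_pred, average='macro')   # their plain mean
print(per_class, macro)  # [0.5, 0.667], 0.583: class 0 pulls the macro score down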

I will pick this up again further down.

X_train_dataset4
(output: 80000 rows × 75 columns, the feature chunks 1, 3, 4 and 5 concatenated, with chunk 2 held out)

Single Model (XGBoost)

Description of XGBClassifier's hyperparameters

  • LINK : https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBClassifier

  • max_depth=3 : Maximum depth of a decision tree

  • learning_rate=0.1 : Boosting learning rate (eta) between 0 and 1. The weights newly added after each boosting step are scaled by this factor. Lower values are more conservative and need more decision trees to converge.

  • n_estimators=100 : Number of boosting rounds, i.e. the number of boosted decision trees

  • silent=True : Whether to print messages while boosting runs

  • objective="reg:linear" : The learning task. Predefined tasks can be specified as strings; otherwise a callable can be supplied.

  • booster="gbtree" : One of 'gbtree', 'gblinear', or 'dart'. 'dart' adds a dropout feature (it randomly selects and drops decision trees to prevent overfitting); 'gblinear' builds a regularized linear model (similar to lasso regression rather than decision trees).

  • nthread=None : Deprecated.

  • n_jobs : Number of threads to use

  • gamma=0 : Minimum loss reduction required to split a node

  • min_child_weight=1 : Minimum sum of hessian required to create a child node

  • max_delta_step=0 : Makes updates more conservative. Set it to a value from 1 to 10 for datasets with imbalanced classes.

  • subsample=1 : Fraction of samples to use for boosting

  • colsample_bytree=1 : Fraction of feature columns to use for boosting

  • colsample_bylevel=1 : Fraction of feature columns to use per level of each decision tree

  • colsample_bynode=1 : Fraction of feature columns to use per node of each decision tree

  • reg_alpha=0 : L1 regularization on the weights. Larger values are more conservative.

  • reg_lambda=1 : L2 regularization on the weights. Larger values are more conservative.

  • base_score=.5 : Initial bias

  • seed=None : Deprecated.

  • random_state=0 : Random number seed

  • missing=None : The value in the data to be interpreted as missing. None means np.nan.

  • importance_type='gain' : Type of feature importance. Can be 'gain', 'weight', 'cover', 'total_gain', or 'total_cover'.

# # Hyperparameter tuning via grid search
# xgb_clf = xgboost.XGBClassifier()

# param_grid = {'max_depth':[5,10],
#               'n_estimators':[50, 100],
#               }

# cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=2)

# grid_search = GridSearchCV(estimator=xgb_clf,
#                            param_grid=param_grid, 
#                            n_jobs=-1,
#                            cv=cv,
#                            scoring='accuracy', 
#                            error_score=0) 

# results=grid_search.fit(X_train_dataset1, y_train_dataset1)

# results.best_params_
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_valid = sc.transform(X_valid)

XGBoost hyperparameter tuning

Some plots went missing while I was tuning hyperparameters;

many runs took so long that I stopped them partway through ㅜㅜ

Still, the values below were all obtained by actually running the searches!!!

For this part it is fine to just skim and note "so this is roughly how the tuning was done".

tuning_dataset = df[:80000].copy()  # .copy() avoids a SettingWithCopyWarning on the column assignment below
tuning_testset = df[80000:]
idx = [i for i in range(80000)]
tuning_dataset["Id"] = idx
tuning_dataset
(output: 80000 rows × 77 columns, the training table plus the new Id column)

def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=100):

    # Use xgboost's built-in CV with early stopping to pick a good n_estimators
    # (relies on the global `target` column name defined below)
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgboost.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgboost.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                              nfold=cv_folds, metrics='error', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
        print(alg)

    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['depvar'], eval_metric='error')

    # Predict on the training set
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    # Print the model report
    print("\nModel Report")
    print("Training Accuracy : %.4g" % metrics.accuracy_score(dtrain['depvar'].values, dtrain_predictions))
target = 'depvar'
IDcol = 'Id'
# Why is training accuracy lower with n_estimators=1000 than with 100?
predictors = [x for x in tuning_dataset.columns if x not in [target, IDcol]]
model1 = xgboost.XGBClassifier(learning_rate = 0.01,
                                n_estimators = 1000,
                                max_depth = 5,
                                min_child_weight = 1,
                                gamma = 0,
                                subsample = 0.8,
                                colsample_bytree = 0.8,
                                objective="binary:logistic",
                                nthread= -1,
                                scale_pos_weight = 1,
                                seed=220205                                
                                )
modelfit(model1, tuning_dataset, predictors)

n_estimators fix


## I want to use the GPU:
# tree_method='gpu_hist',
# predictor='gpu_predictor'


cv_scores = list()
estimator_list = [i for i in range(100, 1000, 50)]
for i in tqdm(estimator_list):
  xgbc = xgboost.XGBClassifier(learning_rate = 0.01,
                                n_estimators = i,
                                max_depth = 5,
                                min_child_weight = 1,
                                gamma = 0,
                                subsample = 0.8,
                                colsample_bytree = 0.8,
                                objective="binary:logistic",
                                nthread= -1,
                                scale_pos_weight = 1,
                                seed=220205,
                                tree_method='gpu_hist',
                                predictor='gpu_predictor'
                                )
  score = cross_val_score(xgbc, X_train_dataset1, y_train_dataset1, cv=5, scoring="f1_macro").mean()
  cv_scores.append(score)

# n_estimators value(s) with the best CV score
best_e = [estimator_list[i] for i in range(len(cv_scores)) if cv_scores[i] == np.max(cv_scores)]
plt.figure(figsize=(20,10))
plt.plot(estimator_list, cv_scores, marker='o', linestyle='dashed')
plt.axvline(best_e[0], color='r', linestyle= '--', linewidth=2)
print(f"best n_estimators : {best_e[0]} ")
# 350
print(f"CV score on the train set : {max(cv_scores):.4f} ")

Max depth


cv_scores = list()
max_depth_list = [5, 7, 9]
for i in tqdm(max_depth_list):
  xgbc = xgboost.XGBClassifier(learning_rate = 0.01,
                                n_estimators = 350,
                                max_depth = i,
                                min_child_weight = 1,
                                gamma = 0,
                                subsample = 0.8,
                                colsample_bytree = 0.8,
                                objective="binary:logistic",
                                nthread= -1,
                                scale_pos_weight = 1,
                                seed=220205,
                                tree_method='gpu_hist',
                                predictor='gpu_predictor')
  score = cross_val_score(xgbc, X_train_dataset1, y_train_dataset1, cv=5, scoring="f1_macro").mean()
  cv_scores.append(score)

# max_depth value(s) with the best CV score
best_e = [max_depth_list[i] for i in range(len(cv_scores)) if cv_scores[i] == np.max(cv_scores)]
plt.figure(figsize=(20,10))
plt.plot(max_depth_list, cv_scores, marker='o', linestyle='dashed')
plt.axvline(best_e[0], color='r', linestyle= '--', linewidth=2)
print(f"best max_depth : {best_e[0]} ")
# 9
print(f"CV score on the train set : {max(cv_scores):.4f} ")

gamma


cv_scores = list()
gamma_list = [0.0, 0.1, 0.2, 0.3, 0.4]
for i in tqdm(gamma_list):
  xgbc = xgboost.XGBClassifier(learning_rate = 0.01,
                                n_estimators = 350,
                                max_depth = 9,
                                min_child_weight = 1,
                                gamma = i,
                                subsample = 0.8,
                                colsample_bytree = 0.8,
                                objective="binary:logistic",
                                nthread= -1,
                                scale_pos_weight = 1,
                                seed=220205,
                                eval_metric='error',
                                tree_method='gpu_hist',
                                predictor='gpu_predictor')
  score = cross_val_score(xgbc, X_train_dataset1, y_train_dataset1, cv=5, scoring="accuracy").mean()
  cv_scores.append(score)

# gamma value(s) with the best CV score
best_e = [gamma_list[i] for i in range(len(cv_scores)) if cv_scores[i] == np.max(cv_scores)]
100%|██████████| 5/5 [04:19<00:00, 51.92s/it]
plt.figure(figsize=(20,10))
plt.plot(gamma_list, cv_scores, marker='o', linestyle='dashed')
plt.axvline(best_e[0], color='r', linestyle= '--', linewidth=2)
<matplotlib.lines.Line2D at 0x7f5c64528c90>

print(f"best gamma : {best_e[0]} ")
print(f"CV score on the train set : {max(cv_scores):.4f} ")
best gamma : 0.0 
CV score on the train set : 0.7508 

min_child_weight


cv_scores = list()
min_child_weight_list = [i for i in range(1,10,2)]
for i in tqdm(min_child_weight_list):
  xgbc = xgboost.XGBClassifier(learning_rate = 0.1,
                                n_estimators = 350,
                                max_depth = 5,
                                min_child_weight = i,
                                gamma = 0,
                                subsample = 0.8,
                                colsample_bytree = 0.8,
                                objective="binary:logistic",
                                nthread= -1,
                                scale_pos_weight = 1,
                                seed=220205,
                                eval_metric='error',
                                tree_method='gpu_hist',
                                predictor='gpu_predictor')
  score = cross_val_score(xgbc, X_train_dataset1, y_train_dataset1, cv=5, scoring="accuracy").mean()
  cv_scores.append(score)

# min_child_weight value(s) with the best CV score
best_e = [min_child_weight_list[i] for i in range(len(cv_scores)) if cv_scores[i] == np.max(cv_scores)]
100%|██████████| 5/5 [01:20<00:00, 16.01s/it]
plt.figure(figsize=(20,10))
plt.plot(min_child_weight_list, cv_scores, marker='o', linestyle='dashed')
plt.axvline(best_e[0], color='r', linestyle= '--', linewidth=2)
<matplotlib.lines.Line2D at 0x7f5c4b3b71d0>

print(f"best min_child_weight : {best_e[0]} ")
# 1
print(f"CV score on the train set : {max(cv_scores):.4f} ")
best min_child_weight : 1 
CV score on the train set : 0.7541 

XGBoost Model Training

The values obtained above are passed in as the models' training parameters.


tree_method='gpu_hist',

predictor='gpu_predictor'

These two arguments mean training runs on the GPU; if you do not have one, just delete them and proceed.
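If you want a single code path for both cases, the GPU flags can be switched on conditionally. A minimal sketch (xgb_params and use_gpu are my own names, not from the original code):

import xgboost

def xgb_params(use_gpu=True):
    # shared hyperparameters for every fold model
    params = dict(learning_rate=0.01, n_estimators=1000, max_depth=5,
                  min_child_weight=10, gamma=0, subsample=0.8,
                  colsample_bytree=0.8, objective="binary:logistic",
                  nthread=-1, scale_pos_weight=1, seed=220205)
    if use_gpu:  # requires an XGBoost build with CUDA support
        params.update(tree_method='gpu_hist', predictor='gpu_predictor')
    return params

model = xgboost.XGBClassifier(**xgb_params(use_gpu=False))  # CPU fallback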

Training then proceeds exactly as described in Train Valid Split.

model1 = xgboost.XGBClassifier(learning_rate = 0.01,
                                n_estimators = 350,
                                max_depth = 9,
                                min_child_weight = 1,
                                gamma = 0,
                                subsample = 0.8,
                                colsample_bytree = 0.8,
                                objective="binary:logistic",
                                nthread= -1,
                                scale_pos_weight = 1,
                                seed=220205,
                                tree_method='gpu_hist',
                                predictor='gpu_predictor')


model2 = xgboost.XGBClassifier(learning_rate = 0.01,
                                n_estimators = 1000,
                                max_depth = 5,
                                min_child_weight = 10,
                                gamma = 0,
                                subsample = 0.8,
                                colsample_bytree = 0.8,
                                objective="binary:logistic",
                                nthread= -1,
                                scale_pos_weight = 1,
                                seed=220205,
                                tree_method='gpu_hist',
                                predictor='gpu_predictor')

model3 = xgboost.XGBClassifier(learning_rate = 0.01,
                                n_estimators = 1000,
                                max_depth = 5,
                                min_child_weight = 10,
                                gamma = 0,
                                subsample = 0.8,
                                colsample_bytree = 0.8,
                                objective="binary:logistic",
                                nthread= -1,
                                scale_pos_weight = 1,
                                seed=220205,
                                tree_method='gpu_hist',
                                predictor='gpu_predictor')

model4 = xgboost.XGBClassifier(learning_rate = 0.01,
                                n_estimators = 1000,
                                max_depth = 5,
                                min_child_weight = 10,
                                gamma = 0,
                                subsample = 0.8,
                                colsample_bytree = 0.8,
                                objective="binary:logistic",
                                nthread= -1,
                                scale_pos_weight = 1,
                                seed=220205,
                                tree_method='gpu_hist',
                                predictor='gpu_predictor')

model5 = xgboost.XGBClassifier(learning_rate = 0.01,
                                n_estimators = 1000,
                                max_depth = 5,
                                min_child_weight = 10,
                                gamma = 0,
                                subsample = 0.8,
                                colsample_bytree = 0.8,
                                objective="binary:logistic",
                                nthread= -1,
                                scale_pos_weight = 1,
                                seed=220205,
                                tree_method='gpu_hist',
                                predictor='gpu_predictor')

model1.fit(X_train_dataset1, y_train_dataset1)
model2.fit(X_train_dataset2, y_train_dataset2)
model3.fit(X_train_dataset3, y_train_dataset3)
model4.fit(X_train_dataset4, y_train_dataset4)
model5.fit(X_train_dataset5, y_train_dataset5)

XGBClassifier(colsample_bytree=0.8, learning_rate=0.01, max_depth=5,
              min_child_weight=10, n_estimators=1000, nthread=-1,
              predictor='gpu_predictor', seed=220205, subsample=0.8,
              tree_method='gpu_hist')
y_pred1 = model1.predict(X_train5)
y_pred2 = model2.predict(X_train4)
y_pred3 = model3.predict(X_train3)
y_pred4 = model4.predict(X_train2)
y_pred5 = model5.predict(X_train1)

# predicted probabilities of y_valid being 0 or 1
y_prob1 = model1.predict_proba(X_train5)
y_prob2 = model2.predict_proba(X_train4)
y_prob3 = model3.predict_proba(X_train3)
y_prob4 = model4.predict_proba(X_train2)
y_prob5 = model5.predict_proba(X_train1)

# count and ratio of validation rows predicted as 1
print("thr > 0.5")
print(f" model1 predicted y=1 ratio : {y_pred1.sum() / len(y_pred1)}")
print(f" model2 predicted y=1 ratio : {y_pred2.sum() / len(y_pred2)}")
print(f" model3 predicted y=1 ratio : {y_pred3.sum() / len(y_pred3)}")
print(f" model4 predicted y=1 ratio : {y_pred4.sum() / len(y_pred4)}")
print(f" model5 predicted y=1 ratio : {y_pred5.sum() / len(y_pred5)}")

# predictions with a threshold of 0.38 instead of the default 0.5
thr = 0.38
print(f"thr > {thr}")
pred1 = (y_prob1[:,1] >= thr).astype(np.int64)
pred2 = (y_prob2[:,1] >= thr).astype(np.int64)
pred3 = (y_prob3[:,1] >= thr).astype(np.int64)
pred4 = (y_prob4[:,1] >= thr).astype(np.int64)
pred5 = (y_prob5[:,1] >= thr).astype(np.int64)
print(f" model1 predicted y=1 ratio : {pred1.sum() / len(pred1)}")
print(f" model2 predicted y=1 ratio : {pred2.sum() / len(pred2)}")
print(f" model3 predicted y=1 ratio : {pred3.sum() / len(pred3)}")
print(f" model4 predicted y=1 ratio : {pred4.sum() / len(pred4)}")
print(f" model5 predicted y=1 ratio : {pred5.sum() / len(pred5)}")
thr > 0.5
 model1 predicted y=1 ratio : 0.2039
 model2 predicted y=1 ratio : 0.2112
 model3 predicted y=1 ratio : 0.2054
 model4 predicted y=1 ratio : 0.21025
 model5 predicted y=1 ratio : 0.2036
thr > 0.38
 model1 predicted y=1 ratio : 0.3277
 model2 predicted y=1 ratio : 0.3358
 model3 predicted y=1 ratio : 0.32705
 model4 predicted y=1 ratio : 0.33205
 model5 predicted y=1 ratio : 0.32595

Looking at this, when you use predict() the default threshold is 0.5, so the y=1 ratio ends up far from the train_dataset's ratio.

So we have to use predict_proba() instead and adjust the threshold ourselves.

Honestly, I also wondered whether changing the threshold like this spoils what the model has already learned,

and whether it is even possible to adjust the threshold while training the model.

That part I still need to study.

I have been digging through the docs, but with my current ability I could not figure it out.

Maybe it has to be implemented by hand ㅜㅜ
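On the training-time question: as far as I know XGBoost has no threshold parameter to learn (scale_pos_weight, already set in the models above, is its closest training-time knob for imbalance), so the usual recipe is exactly a post-hoc sweep of the threshold on validation predictions. A minimal sketch, assuming y_prob1 and y_train5 from the cells above:

import numpy as np
from sklearn.metrics import f1_score

# Sweep candidate thresholds and keep the one with the best macro F1
# on the held-out chunk; the y=1 ratio can be checked at the same time.
thresholds = np.arange(0.30, 0.51, 0.01)
scores = [f1_score(y_train5, (y_prob1[:, 1] >= t).astype(int), average='macro')
          for t in thresholds]
best = thresholds[int(np.argmax(scores))]
print(f"best thr = {best:.2f}, macro F1 = {max(scores):.4f}, "
      f"y=1 ratio = {(y_prob1[:, 1] >= best).mean():.4f}")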

# define an evaluation helper
def get_clf_eval(y_actual, y_pred):
    accuracy = accuracy_score(y_actual, y_pred)
    precision = precision_score(y_actual, y_pred)
    recall = recall_score(y_actual, y_pred)
    AUC = roc_auc_score(y_actual, y_pred)
    F1 = f1_score(y_actual, y_pred)
    print('\naccuracy: {:.4f}'.format(accuracy))
    print('precision: {:.4f}'.format(precision))
    print('recall: {:.4f}'.format(recall))
    print('AUC: {:.4f}'.format(AUC))
    print('F1: {:.4f}'.format(F1))
    
    # sns.heatmap(confusion_matrix(y_actual, y_pred), annot=True, fmt='d', cmap='YlGnBu')
# check xgboost performance on each held-out chunk
get_clf_eval(y_train5, pred1)
get_clf_eval(y_train4, pred2)
get_clf_eval(y_train3, pred3)
get_clf_eval(y_train2, pred4)
get_clf_eval(y_train1, pred5)

accuracy: 0.7500
precision: 0.6038
recall: 0.6221
AUC: 0.7158
F1: 0.6128

accuracy: 0.7428
precision: 0.6127
recall: 0.6181
AUC: 0.7116
F1: 0.6154

accuracy: 0.7459
precision: 0.6149
recall: 0.6107
AUC: 0.7115
F1: 0.6128

accuracy: 0.7456
precision: 0.6026
recall: 0.6203
AUC: 0.7127
F1: 0.6113

accuracy: 0.7435
precision: 0.6059
recall: 0.6066
AUC: 0.7081
F1: 0.6062

Not a satisfying result...

With MNIST and the other datasets sklearn provides, training always got me past 90,

but here, even though I am training on the train_dataset, I could not clear 80...

Who knows how much lower it will drop on test ㅠㅠ

Randomforest hyperparameter tuning



Randomforest Model Training

Submission

# download the submission template
submit = pd.read_csv('/content/drive/MyDrive/22-01-28/sample_submission.csv')

# run predictions on the test set
df_test = pd.read_csv('/content/drive/MyDrive/22-01-28/test.csv')
test = df_test.drop('ID', axis=1)
test
(output: 35816 rows × 75 columns, the test feature table)

# out1 = model1.predict(test)
# out2 = model2.predict(test)
# out3 = model3.predict(test)
# out4 = model4.predict(test)
# out5 = model5.predict(test)


prob1 = model1.predict_proba(test)
out1 = (prob1[:,1] >= thr).astype(np.int64)

prob2 = model2.predict_proba(test)
out2 = (prob2[:,1] >= thr).astype(np.int64)

prob3 = model3.predict_proba(test)
out3 = (prob3[:,1] >= thr).astype(np.int64)

prob4 = model4.predict_proba(test)
out4 = (prob4[:,1] >= thr).astype(np.int64)

prob5 = model5.predict_proba(test)
out5 = (prob5[:,1] >= thr).astype(np.int64)




print(out1.sum() / len(out1))
print(out2.sum() / len(out2))
print(out3.sum() / len(out3))
print(out4.sum() / len(out4))
print(out5.sum() / len(out5))






0.3358554835827563
0.334682823319187
0.332449184721912
0.3370002233638597
0.3345432209068573
result = []
for i in range(len(test)):
  votes = out1[i] + out2[i] + out3[i] + out4[i] + out5[i]
  # majority vote: class 1 when at least 3 of the 5 models predict 1
  result.append(1 if votes >= 3 else 0)

print(f" Final result!!! ")
print(f" {sum(result) / len(result)} ")
print(len(result))
print(result.count(1))

submit["answer"] = result  
 Final result!!! 
 0.3345153004243913 
35816
11981
# save the submission file
submit.to_csv('/content/drive/MyDrive/22-02-05/2022-02-05-14.csv', index=False)

With the y=1 ratio as priority number one

and the f1 score as priority number two,

I revised and retrained the model.

On the deadline I plan to follow those same two principles for the final submission. (Until I study enough to learn something new, that is unlikely to change.)

The comments below record the scores and hyperparameters of the files I submitted.

In closing..

There is so much left to study!!!!

##########################################
# CSV file descriptions
# 2022-02-05-1 : no hyperparameter tuning; plain predict (threshold 0.5).
# 2022-02-05-2 : no hyperparameter tuning; predict with threshold 0.42.
# 2022-02-05-3 : tuned hyperparameters; predict with threshold 0.42. (How did it get worse after tuning?)
# xgboost.XGBClassifier(learning_rate = 0.01,
#                                 n_estimators = 350,
#                                 max_depth = 9,
#                                 min_child_weight = 1,
#                                 gamma = 0,
#                                 subsample = 0.8,
#                                 colsample_bytree = 0.8,
#                                 objective="binary:logistic",
#                                 nthread= -1,
#                                 scale_pos_weight = 1,
#                                 seed=220205,
#                                 tree_method='gpu_hist',
#                                 predictor='gpu_predictor')
# 2022-02-05-4 : tuned hyperparameters; predict with threshold 0.38. (best score: 0.717169)
# 2022-02-05-5 : tuned hyperparameters; predict with threshold 0.35. At 0.35 the ratio of label broke (0.39) but the f1 score was high! 0.637~0.638
# 2022-02-05-6 : tuned hyperparameters; predict with threshold 0.35.
# score: 713982
# xgboost.XGBClassifier(learning_rate = 0.01,
#                                 n_estimators = 1000,
#                                 max_depth = 5,
#                                 min_child_weight = 1,
#                                 gamma = 0,
#                                 subsample = 0.8,
#                                 colsample_bytree = 0.8,
#                                 objective="binary:logistic",
#                                 nthread= -1,
#                                 scale_pos_weight = 1,
#                                 seed=220205,
#                                 eval_metric='error',
#                                 tree_method='gpu_hist',
#                                 predictor='gpu_predictor')
# 2022-02-05-7 : tuned hyperparameters; predict with threshold 0.38.
# score: 713330
# xgboost.XGBClassifier(learning_rate = 0.01,
#                                 n_estimators = 1000,
#                                 max_depth = 5,
#                                 min_child_weight = 1,
#                                 gamma = 0,
#                                 subsample = 0.8,
#                                 colsample_bytree = 0.8,
#                                 objective="binary:logistic",
#                                 nthread= -1,
#                                 scale_pos_weight = 1,
#                                 seed=220205,
#                                 eval_metric='error',
#                                 tree_method='gpu_hist',
#                                 predictor='gpu_predictor')
# 2022-02-05-8 : tuned hyperparameters; predict with threshold 0.4.
# score: 713574
# xgboost.XGBClassifier(learning_rate = 0.01,
#                                 n_estimators = 350,
#                                 max_depth = 9,
#                                 min_child_weight = 1,
#                                 gamma = 0,
#                                 subsample = 0.8,
#                                 colsample_bytree = 0.8,
#                                 objective="binary:logistic",
#                                 nthread= -1,
#                                 scale_pos_weight = 1,
#                                 seed=220205,
#                                 eval_metric='error',
#                                 tree_method='gpu_hist',
#                                 predictor='gpu_predictor')
# 2022-02-05-9 : tuned hyperparameters; predict with threshold 0.405.
# score: 712255
# xgboost.XGBClassifier(learning_rate = 0.01,
#                                 n_estimators = 350,
#                                 max_depth = 9,
#                                 min_child_weight = 1,
#                                 gamma = 0,
#                                 subsample = 0.8,
#                                 colsample_bytree = 0.8,
#                                 objective="binary:logistic",
#                                 nthread= -1,
#                                 scale_pos_weight = 1,
#                                 seed=220205,
#                                 tree_method='gpu_hist',
#                                 predictor='gpu_predictor')
# 2022-02-05-10 : tuned hyperparameters; predict with threshold 0.4.
# score: 713574
# xgboost.XGBClassifier(learning_rate = 0.01,
#                                 n_estimators = 350,
#                                 max_depth = 9,
#                                 min_child_weight = 1,
#                                 gamma = 0,
#                                 subsample = 0.8,
#                                 colsample_bytree = 0.8,
#                                 objective="binary:logistic",
#                                 nthread= -1,
#                                 scale_pos_weight = 1,
#                                 seed=220205,
#                                 tree_method='gpu_hist',
#                                 predictor='gpu_predictor')
# 2022-02-05-11 : tuned hyperparameters; predict with threshold 0.38.
# score: 716175
# xgboost.XGBClassifier(learning_rate = 0.01,
#                                 n_estimators = 350,
#                                 max_depth = 9,
#                                 min_child_weight = 1,
#                                 gamma = 0,
#                                 subsample = 0.8,
#                                 colsample_bytree = 0.8,
#                                 objective="binary:logistic",
#                                 nthread= -1,
#                                 scale_pos_weight = 1,
#                                 seed=220205,
#                                 tree_method='gpu_hist',
#                                 predictor='gpu_predictor')
# 2022-02-05-12 : tuned hyperparameters; predict with threshold 0.36.
# score: 716011
# xgboost.XGBClassifier(learning_rate = 0.01,
#                                 n_estimators = 350,
#                                 max_depth = 9,
#                                 min_child_weight = 1,
#                                 gamma = 0,
#                                 subsample = 0.8,
#                                 colsample_bytree = 0.8,
#                                 objective="binary:logistic",
#                                 nthread= -1,
#                                 scale_pos_weight = 1,
#                                 seed=220205,
#                                 tree_method='gpu_hist',
#                                 predictor='gpu_predictor')
# 2022-02-05-13 : tuned hyperparameters; predict with thresholds 0.38-0.4.
# score: 713264
# This one only changed model1.
# 2022-02-05-14 : tuned hyperparameters; predict with threshold 0.38.
# score: 713171
# xgboost.XGBClassifier(learning_rate = 0.01,
#                                 n_estimators = 1000,
#                                 max_depth = 5,
#                                 min_child_weight = 10,
#                                 gamma = 0,
#                                 subsample = 0.8,
#                                 colsample_bytree = 0.8,
#                                 objective="binary:logistic",
#                                 nthread= -1,
#                                 scale_pos_weight = 1,
#                                 seed=220205,
#                                 tree_method='gpu_hist',
#                                 predictor='gpu_predictor')
