[MNC] 대출자 채무 불이행 여부 예측 모델

February 5, 2022

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import xgboost
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, RidgeCV, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier

import sklearn.metrics as metrics
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc, confusion_matrix
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

EDA

변수 설명

int_rate : 대출자에 부여된 이자율 (Interest rate of the loan the applicant received)
annual_inc : 연 소득 (annual income)
dti : 소득 대비 부채 비율 (Debt-to-income ratio)
delinq_2yrs : 지난 2년 간 체납 발생 횟수 (Delinquencies on lines of credit in the last 2 years)
inq_last_6mths : 지난 6개월 간 신용 조회 수 (Inquiries into the applicant’s credit during the last 6 months)
pub_rec : 파산 횟수 (Number of bankruptcies listed in the public record)
revol_bal : 리볼빙 잔액 (Total credit revolving balance)
total_acc : 지금까지 소유했던 신용카드 개수 (num_total_cc_accounts : Total number of credit card accounts in the applicant’s history)
collections_12_mths_ex_med : 의료부문을 제외한 지난 12개월 간 추심 발생 횟수 (num_collections_last_12m : Number of collections in the last 12 months. This excludes medical collections)
acc_now_delinq : 대출자가 체납 상태에 있지 않은 계좌의 수 (The number of accounts on which the borrower is now delinquent)
tot_coll_amt : 대출자에 대한 현재까지의 총 추심액 (total_collection_amount_ever : The total amount that the applicant has had against them in collections)
tot_cur_bal : 전 계좌의 현재 통합 잔고 (Total current balance of all accounts)
chargeoff_within_12_mths : 대출 부 신청인의 대출 신청 직전 12개월 간 세금 공제 횟수 (Number of charge-offs within last 12 months at time of application for the secondary applicant)
delinq_amnt : 체납 금액 (delinquency amount)
tax_liens : 세금 저당권의 수 (Number of tax liens)
emp_length1 ~ 12 : 고용 연수 (Number of years in the job)
home_ownership1 ~ 6 : 대출 신청자의 주거 소유 형태 (The ownership status of the applicant’s residence)
verification_status1 ~ 3 : 공동 소득 발생 여부 및 형태 (verification_income_joint : Type of verification of the joint income)
purpose1 ~ 14 : 대출 목적 (The purpose of the loan)
initial_list_status1 ~ 2 : 최초 대출 상태 (Initial listing status of the loan)
mths_since_last_delinq1 ~ 11 : 마지막 체납이 지금으로부터 몇개월 전에 있었는지를 나타내는 변수 (Months since the last delinquency)
funded_amnt : 대출액 (Funded amount)
funded_amnt_inv : 사채 대출액 (Funded amount by investors)
total_rec_late_fee : 총 연체료 중 납부액 (Late fees received to date)
term1 : 상환 기간 (The number of payments on the loan. Values are in months and can be either 36 or 60)
open_acc : 개설 개좌 수 (The number of open credit lines in the borrower’s credit file)
installment : 대출 발생 시 월 상환액 (The monthly payment owed by the borrower if the loan originates)
revol_util : 리볼빙 한도 대비 리볼빙 사용 비율 (Revolving line utilization rate, or the amount of credit the borrower is using relative to all available revolving credit)
out_prncp : 대출액 중 원리금 잔액 (Remaining outstanding principal for total amount funded)
out_prncp_inv : 사채 대출액 중 원리금 잔액 (Remaining outstanding principal for total amount funded by investors)
total_rec_int : 이자 상환액 (Interest received to date)
fico_range_low : FICO(일종의 신용점수) 최저값 (The lower boundary range the borrower’s FICO at loan origination belongs to)
fico_range_high : FICO(일종의 신용점수) 최고값 (The upper boundary range the borrower’s FICO at loan origination belongs to)
depvar : 고객의 부도 여부 (dependent variable)

df = pd.read_csv('/content/drive/MyDrive/22-01-28/train.csv')
df.head()

	int_rate	annual_inc	dti	delinq_2yrs	inq_last_6mths	pub_rec	revol_bal	total_acc	tot_coll_amt	tot_cur_bal	emp_length1	emp_length2	emp_length3	emp_length4	home_ownership2	home_ownership6	verification_status2	verification_status3	purpose2	purpose3	initial_list_status2	mths_since_last_delinq1	mths_since_last_delinq5	mths_since_last_delinq11	funded_amnt	funded_amnt_inv	term1	open_acc	installment	revol_util	total_rec_int	fico_range_low	fico_range_high	depvar
0	0.0824	21000.0	29.19	0	1	0	3016	26	0	11773	1	0	0	0	0	1	1	0	1	0	1	1	0	0	1200	1200.0	1	18	37.74	0.076	157.94	765	769	0
1	0.1299	80000.0	4.82	0	1	1	5722	24	0	21875	0	1	0	0	0	1	0	1	0	1	1	1	0	0	8000	8000.0	1	8	269.52	0.447	1702.42	665	669	0
2	0.1299	38000.0	23.66	0	3	0	6511	18	0	31868	0	0	0	1	0	1	1	0	0	1	1	0	0	1	5000	5000.0	1	7	168.45	0.880	1066.64	670	674	0
3	0.1367	100000.0	16.27	4	2	0	6849	30	0	326049	0	0	1	0	1	0	0	1	0	1	1	0	1	0	15000	15000.0	1	12	510.27	0.457	1256.24	680	684	1
4	0.1269	30000.0	25.28	0	1	2	8197	12	2506	8840	0	0	0	1	0	1	1	0	0	1	1	1	0	0	10000	10000.0	1	8	335.45	0.416	871.04	660	664	1

기본적으로 MNC에서 제공한 데이터는 깔끔했다. Feature engineering도 잘했다.

(처음에는 데이터를 받았으니, Feature engineering을 한다고, 시간을 많이 보냈는데, 잘했다…)

df.describe()

	int_rate	annual_inc	dti	delinq_2yrs	inq_last_6mths	pub_rec	revol_bal	total_acc	collections_12_mths_ex_med	acc_now_delinq	tot_coll_amt	tot_cur_bal	chargeoff_within_12_mths	delinq_amnt	tax_liens	emp_length1	emp_length2	emp_length3	emp_length4	emp_length5	emp_length6	emp_length7	emp_length8	emp_length9	emp_length10	emp_length11	emp_length12	home_ownership1	home_ownership2	home_ownership3	home_ownership4	home_ownership5	home_ownership6	verification_status1	verification_status2	verification_status3	purpose1	purpose2	purpose3	purpose4	purpose5	purpose6	purpose7	purpose8	purpose9	purpose10	purpose11	purpose12	purpose13	purpose14	initial_list_status1	initial_list_status2	mths_since_last_delinq1	mths_since_last_delinq2	mths_since_last_delinq3	mths_since_last_delinq4	mths_since_last_delinq5	mths_since_last_delinq6	mths_since_last_delinq7	mths_since_last_delinq8	mths_since_last_delinq9	mths_since_last_delinq10	mths_since_last_delinq11	funded_amnt	funded_amnt_inv	total_rec_late_fee	term1	open_acc	installment	revol_util	out_prncp	out_prncp_inv	total_rec_int	fico_range_low	fico_range_high	depvar
count	100000.000000	1.000000e+05	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	1.000000e+05	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.00000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.0	100000.000000	100000.000000	100000.000000	100000.00000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	1.000000e+05	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000	100000.000000
mean	0.130833	7.436061e+04	18.514508	0.343700	0.668880	0.235720	16090.202820	24.675910	0.018810	0.005800	250.588120	1.305281e+05	0.009630	19.288330	0.063820	0.065400	0.326240	0.089410	0.08058	0.059480	0.060710	0.046640	0.045320	0.045690	0.038570	0.075550	0.066410	0.000330	0.466120	0.000070	0.000050	0.110880	0.422550	0.321800	0.390200	0.288000	0.009620	0.226720	0.582810	0.0	0.060400	0.004250	0.019830	0.01101	0.007310	0.058200	0.000650	0.011250	0.007070	0.000880	0.434430	0.565570	0.489320	0.052360	0.060090	0.045700	0.050750	0.048560	0.051340	0.053430	0.048960	0.050030	0.049460	13735.317750	13729.341073	2.448885e+00	0.855920	11.620700	434.077648	0.537233	0.253327	0.253259	2491.282802	692.630550	696.630660	0.325690
std	0.044773	7.467409e+04	8.413049	0.905007	0.952044	0.661468	21569.939271	11.883834	0.150321	0.083585	2042.770881	1.503326e+05	0.110079	893.304366	0.468027	0.247232	0.468839	0.285336	0.27219	0.236522	0.238799	0.210868	0.208006	0.208813	0.192569	0.264278	0.248999	0.018163	0.498853	0.008366	0.007071	0.313985	0.493968	0.467169	0.487797	0.452833	0.097609	0.418712	0.493097	0.0	0.238228	0.065054	0.139417	0.10435	0.085186	0.234122	0.025487	0.105468	0.083786	0.029652	0.495684	0.495684	0.499888	0.222753	0.237655	0.208835	0.219488	0.214947	0.220691	0.224891	0.215786	0.218008	0.216828	8464.825314	8461.694483	1.489496e+01	0.351173	5.458774	265.921746	0.239373	18.053290	18.051746	2706.262200	29.668017	29.668584	0.468634
min	0.053200	5.360000e+03	0.000000	0.000000	0.000000	0.000000	0.000000	2.000000	0.000000	0.000000	0.000000	0.000000e+00	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.00000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.000000	0.00000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1000.000000	800.000000	-2.000000e-09	0.000000	1.000000	23.360000	0.000000	0.000000	0.000000	0.000000	660.000000	664.000000	0.000000
25%	0.097500	4.500000e+04	12.200000	0.000000	0.000000	0.000000	6009.000000	16.000000	0.000000	0.000000	0.000000	2.698900e+04	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.00000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.000000	0.00000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	7200.000000	7200.000000	0.000000e+00	1.000000	8.000000	240.292500	0.361000	0.000000	0.000000	857.292500	670.000000	674.000000	0.000000
50%	0.127400	6.200000e+04	18.060000	0.000000	0.000000	0.000000	11030.500000	23.000000	0.000000	0.000000	0.000000	6.802350e+04	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.00000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	0.0	0.000000	0.000000	0.000000	0.00000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	12000.000000	12000.000000	0.000000e+00	1.000000	11.000000	366.370000	0.541000	0.000000	0.000000	1615.160000	685.000000	689.000000	0.000000
75%	0.158000	9.000000e+04	24.530000	0.000000	1.000000	0.000000	19540.000000	31.000000	0.000000	0.000000	0.000000	1.943098e+05	0.000000	0.000000	0.000000	0.000000	1.000000	0.000000	0.00000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	0.000000	0.000000	0.000000	1.000000	1.000000	1.000000	1.000000	0.000000	0.000000	1.000000	0.0	0.000000	0.000000	0.000000	0.00000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	1.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	19200.000000	19200.000000	0.000000e+00	1.000000	14.000000	575.860000	0.720000	0.000000	0.000000	3039.115000	705.000000	709.000000	1.000000
max	0.309900	8.300000e+06	49.930000	20.000000	6.000000	63.000000	971736.000000	176.000000	5.000000	6.000000	197765.000000	3.164353e+06	7.000000	94521.000000	63.000000	1.000000	1.000000	1.000000	1.00000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	0.0	1.000000	1.000000	1.000000	1.00000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	40000.000000	40000.000000	8.741700e+02	1.000000	82.000000	1584.900000	8.923000	2330.970000	2330.970000	28005.960000	845.000000	850.000000	1.000000

# 결측치 확인
pd.DataFrame(df.isnull().sum()).rename(columns={0:'Non-Null Count'}).T

	int_rate	annual_inc	dti	delinq_2yrs	inq_last_6mths	pub_rec	revol_bal	total_acc	collections_12_mths_ex_med	acc_now_delinq	tot_coll_amt	tot_cur_bal	chargeoff_within_12_mths	delinq_amnt	tax_liens	emp_length1	emp_length2	emp_length3	emp_length4	emp_length5	emp_length6	emp_length7	emp_length8	emp_length9	emp_length10	emp_length11	emp_length12	home_ownership1	home_ownership2	home_ownership3	home_ownership4	home_ownership5	home_ownership6	verification_status1	verification_status2	verification_status3	purpose1	purpose2	purpose3	purpose4	purpose5	purpose6	purpose7	purpose8	purpose9	purpose10	purpose11	purpose12	purpose13	purpose14	initial_list_status1	initial_list_status2	mths_since_last_delinq1	mths_since_last_delinq2	mths_since_last_delinq3	mths_since_last_delinq4	mths_since_last_delinq5	mths_since_last_delinq6	mths_since_last_delinq7	mths_since_last_delinq8	mths_since_last_delinq9	mths_since_last_delinq10	mths_since_last_delinq11	funded_amnt	funded_amnt_inv	total_rec_late_fee	term1	open_acc	installment	revol_util	out_prncp	out_prncp_inv	total_rec_int	fico_range_low	fico_range_high	depvar
Non-Null Count	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0

Feature engineering을 잘했다고 느껴진게 여기서 드러난다. 결측값(NaN)이 한개도 없었다.

아무래도 모델 튜닝에 신경쓰라는 의도인것 같았다.

# 라벨(depvar) 비율 확인
print('y=1 ratio :', df.depvar.sum()/len(df))

y=1 ratio : 0.32569

f1 score와 y=1 ratio, 마지막으로 macro f1

이 세 가지가 문제를 푸는데 정말 고민하게 했다… 끝에서 얘기를 하겠습니다.

Train Valid Split

어떤 방식으로 문제를 접근했냐면, XGBClassifier라는 모델을 선정했고,

이 모델을 학습시킬때, 총 100,000개의 데이터들을 5등분하여,

4개씩 트레이닝 하는 방법으로 진행하였습니다.

예를 들면,

dataset1, dataset2, dataset3, dataset4, dataset5 이렇게 각각 2만개의 데이터를 받고,

model1 = 1,2,3,4 (5번제외)

model2 = 1,2,3,5 (4번제외)

model3 = 1,2,4,5 (3번제외)

model4 = 1,3,4,5 (2번제외)

model5 = 2,3,4,5 (1번제외)

이런식으로 진행하여 최종 test_dataset에서는,

out1 = model1(test)

out2 = model2(test)

out3 = model3(test)

out4 = model4(test)

out5 = model5(test)

제출용 csv파일에는 5개의 모델중에서 동일한 class가 3개이상이면 해당 class로 제출하는방식을 택했습니다!!!!

(아직 못해본게 많이 있습니다. SMOTE를 통한 데이터 증식, RandomForest model을 통한 학습 등 고려할게 많았지만,

제가 시간을 삽질하는데에 허비해버려서, 대회 마감일에 허겁지겁 학습하게 되었습니다.

경험이 없어서, 모르는 용어가 많았네요 ㅜㅜ)

X = df.drop('depvar', axis=1)
y = df['depvar']

X_train1 = X[:20000]
X_train2 = X[20000:40000]
X_train3 = X[40000:60000]
X_train4 = X[60000:80000]
X_train5 = X[80000:]

y_train1 = y[:20000]
y_train2 = y[20000:40000]
y_train3 = y[40000:60000]
y_train4 = y[60000:80000]
y_train5 = y[80000:]

##########
frames = [X_train1, X_train2, X_train3, X_train4]
X_train_dataset1 = pd.concat(frames)

frames = [X_train1, X_train2, X_train3, X_train5]
X_train_dataset2 = pd.concat(frames)

frames = [X_train1, X_train2, X_train4, X_train5]
X_train_dataset3 = pd.concat(frames)

frames = [X_train1, X_train3, X_train4, X_train5]
X_train_dataset4 = pd.concat(frames)

frames = [X_train2, X_train3, X_train4, X_train5]
X_train_dataset5 = pd.concat(frames)
##########

frames = [y_train1, y_train2, y_train3, y_train4]
y_train_dataset1 = pd.concat(frames)

frames = [y_train1, y_train2, y_train3, y_train5]
y_train_dataset2 = pd.concat(frames)

frames = [y_train1, y_train2, y_train4, y_train5]
y_train_dataset3 = pd.concat(frames)

frames = [y_train1, y_train3, y_train4, y_train5]
y_train_dataset4 = pd.concat(frames)

frames = [y_train2, y_train3, y_train4, y_train5]
y_train_dataset5 = pd.concat(frames)

# training set과 validation set의 데이터 수 확인
print(f"X_train1 = {len(X_train1)}")
print(f"X_train2 = {len(X_train2)}")
print(f"X_train3 = {len(X_train3)}")
print(f"X_train4 = {len(X_train4)}")
print(f"X_train5 = {len(X_train5)}")
print()
print(f"y_train1 = {len(y_train1)}")
print(f"y_train2 = {len(y_train2)}")
print(f"y_train3 = {len(y_train3)}")
print(f"y_train4 = {len(y_train4)}")
print(f"y_train5 = {len(y_train5)}")
print()
print(f"y_train1의 label ratio check = {y_train1.sum() / len(y_train1)}")
print(f"y_train2의 label ratio check = {y_train2.sum() / len(y_train2)}")
print(f"y_train3의 label ratio check = {y_train3.sum() / len(y_train3)}")
print(f"y_train4의 label ratio check = {y_train4.sum() / len(y_train4)}")
print(f"y_train5의 label ratio check = {y_train5.sum() / len(y_train5)}")

X_train1 = 20000
X_train2 = 20000
X_train3 = 20000
X_train4 = 20000
X_train5 = 20000

y_train1 = 20000
y_train2 = 20000
y_train3 = 20000
y_train4 = 20000
y_train5 = 20000

y_train1의 label ratio check = 0.3256
y_train2의 label ratio check = 0.3226
y_train3의 label ratio check = 0.3293
y_train4의 label ratio check = 0.3329
y_train5의 label ratio check = 0.31805

중요한게 위에서 나왔군요..

제가 끝까지 고민하려 했지만, 한계를 느껴서, 이 포스팅을 마치고 공부하러 떠납니다…

(정말 모르는 용어가 너무 많고, 아직 많이 부족함을 느껴 공부를 더 해야합니다.)

끝에서도 말하겠지만, 제가 제출하려고 하는 최종 결과물의 핵심은 아래와 같습니다.

train_dataset에서의 y=1 ratio 비율이 제출하려는 test_dataset에서의 비율과 거의 같아야 합니다.
이번 대회의 평가 지표인 macro f1의 값이 높아야 합니다.
당연한 소리이지만 올바른 class를 많이 맞추어야 합니다.

이게 너무 어려웠습니다.

아직 포스팅을 작성하고 있는 시점이라서, 해결은 못했지만…

어떤 현상이 벌어지냐면요..

fi score가 높으면, y=1 ratio의 값이 너무나도 튀게 됩니다.

ex) f1 score 0.666, y=1 ratio 0.5

f1 score가 precision과 recall에 영향을 받고,

Macro f1은 f1 score에 영향을 받습니다.

그러면 precision과 recall의 값이 높으면 되는데, 단순히 이 값만을 높이게 되면,

y=1 ratio가 튀게되어서 결과적으로 leaderboard에서도 낮은 점수를 기록하게 되었습니다.

(참고로 y=1 ratio가 튀게되어서 점수가 낮아진다고 추측하고있는것 뿐입니다.)

밑에서마저 얘기해보겠습니다.

X_train_dataset4

	int_rate	annual_inc	dti	delinq_2yrs	inq_last_6mths	pub_rec	revol_bal	total_acc	collections_12_mths_ex_med	acc_now_delinq	tot_coll_amt	tot_cur_bal	chargeoff_within_12_mths	delinq_amnt	tax_liens	emp_length1	emp_length2	emp_length3	emp_length4	emp_length5	emp_length6	emp_length7	emp_length8	emp_length9	emp_length10	emp_length11	emp_length12	home_ownership1	home_ownership2	home_ownership3	home_ownership4	home_ownership5	home_ownership6	verification_status1	verification_status2	verification_status3	purpose1	purpose2	purpose3	purpose4	purpose5	purpose6	purpose7	purpose8	purpose9	purpose10	purpose11	purpose12	purpose13	purpose14	initial_list_status1	initial_list_status2	mths_since_last_delinq1	mths_since_last_delinq2	mths_since_last_delinq3	mths_since_last_delinq4	mths_since_last_delinq5	mths_since_last_delinq6	mths_since_last_delinq7	mths_since_last_delinq8	mths_since_last_delinq9	mths_since_last_delinq10	mths_since_last_delinq11	funded_amnt	funded_amnt_inv	total_rec_late_fee	term1	open_acc	installment	revol_util	out_prncp	out_prncp_inv	total_rec_int	fico_range_low	fico_range_high
0	0.0824	21000.0	29.19	0	1	0	3016	26	0	0	0	11773	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1	0	1	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	0	0	0	0	0	0	1200	1200.0	0.0	1	18	37.74	0.076	0.0	0.0	157.94	765	769
1	0.1299	80000.0	4.82	0	1	1	5722	24	0	0	0	21875	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1	0	0	1	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	0	0	0	0	0	0	8000	8000.0	0.0	1	8	269.52	0.447	0.0	0.0	1702.42	665	669
2	0.1299	38000.0	23.66	0	3	0	6511	18	0	0	0	31868	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	1	0	1	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	1	5000	5000.0	0.0	1	7	168.45	0.880	0.0	0.0	1066.64	670	674
3	0.1367	100000.0	16.27	4	2	0	6849	30	0	0	0	326049	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	1	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	1	0	0	0	0	0	0	15000	15000.0	0.0	1	12	510.27	0.457	0.0	0.0	1256.24	680	684
4	0.1269	30000.0	25.28	0	1	2	8197	12	0	0	2506	8840	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	1	0	1	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	0	0	0	0	0	0	10000	10000.0	0.0	1	8	335.45	0.416	0.0	0.0	871.04	660	664
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
99995	0.1757	65000.0	17.67	0	3	1	11255	21	1	0	0	26570	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	1	0	0	1	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	0	0	0	0	0	0	20000	20000.0	0.0	1	13	718.75	0.780	0.0	0.0	5373.29	660	664
99996	0.0890	65000.0	2.88	0	0	0	2105	12	0	0	0	6138	0	0	0	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	1	0	1	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	1	0	1	0	0	0	0	0	0	0	0	0	0	6000	6000.0	0.0	1	7	190.52	0.120	0.0	0.0	835.66	765	769
99997	0.1349	46000.0	32.12	0	1	0	8998	20	0	0	0	96531	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	0	0	0	0	0	0	6400	6400.0	0.0	1	19	217.16	0.643	0.0	0.0	1261.67	665	669
99998	0.2115	31000.0	4.53	0	1	0	3875	4	0	0	0	3875	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	1	0	0	1	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	1	1	0	0	0	0	0	0	0	0	0	0	5500	5500.0	0.0	1	3	207.64	0.731	0.0	0.0	1357.69	710	714
99999	0.1599	125000.0	33.33	0	0	0	34580	30	0	0	0	422626	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	1	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	0	0	0	0	0	0	33125	33125.0	0.0	1	19	1164.42	0.499	0.0	0.0	8882.58	690	694

80000 rows × 75 columns

Single Model(XGBoost)

XGBClassifier의 하이퍼 파라미터 설명

LINK : https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBClassifier
max_depth=3 : 디시전 트리의 최대 깊이
learning_rate=0.1 : 0과 1 사이의 값을 가지는 부스팅에 대한 학습률(eta). 매 부스팅 단계 이후 새로이 추가된 가중치는 이 파라미터로 조정된다. 이 값이 낮을수록 보수적이며, 수렴에 필요한 더 많은 디시전 트리가 필요하다.
n_estimators=100 : 라운드의 횟수 또는 부스팅된 디시전 트리의 개수
silent=True : 부스팅의 수행 동안 메시지를 출력할지에 대한 여부
objective=”reg:linear” : 모델이 학습할 작업의 분류, 미리 정의된 작업은 문자열로 지정이 가능하지만, 그렇지 않은 경우 콜러블callable을 만들어서 지정할 수도 있다.
booster=”gbtree” : ‘gbtree’, ‘gblinear’, ‘dart’일 수 있다. ‘dart’는 드롭아웃(dropout)이라는 특성을 추가한다(과적합 방지를 위해 무작위로 디시전 트리를 선택해 제거(드롭)한다). ‘gblinear’는 정규화된 선형 모델을 만든다(디시전 트리가 아니라 라소 회귀와 유사하다).
nthread=None : 더 이상 사용되지 않는다.
n_jobs : 사용할 스레드의 개수
gamma=0 : 노드 분할에 필요한 최소 손실 감소
min_child_weight=1 : 자식 노드 생성에 필요한 헤시안(hessian) 합의 최솟값
max_delta_step=0 : 보다 보수적으로 갱신을 수행하도록 만드는 값. 불균형 범주의 데이터셋에 대해서는 1부터 10까지의 값으로 설정한다.
subsample=1 : 부스팅에 사용할 샘플의 비율
colsample_bytree=1 : 부스팅에 사용할 특징 열의 비율
colsample_bylevel=1 : 각 디시전 트리의 수준별 사용할 특징 열의 비율
colsample_bynode=1 : 각 디시전 트리의 노드별 사용할 특징 열의 비율
reg_alpha=0 : L1 정규화(가중치의 평균). 이 값이 클수록 보수적이게 된다.
reg_lambda=1 : L2 정규화(가중치의 제곱근). 이 값이 클수록 보수적이게 된다.
base_score=.5 : 초기 편향치(bias)
seed=None : 더 이상 사용되지 않는다.
random_state=0 : 난수 생성 시드
missing=None : 누락된 데이터가 해석될 값. None은 np.nan을 의미한다.
importance_type=’gain’ : 특징 중요도의 유형. ‘gain’, ‘weight’, ‘cover’, ‘total_gain’, ‘total_cover’로 설정될 수 있다.

# # 하이퍼 파라미터 튜닝
# xgb_clf = xgboost.XGBClassifier()

# param_grid = {'max_depth':[5,10],
#               'n_estimators':[50, 100],
#               }

# cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=2)

# grid_search = GridSearchCV(estimator=xgb_clf,
#                            param_grid=param_grid, 
#                            n_jobs=-1,
#                            cv=cv,
#                            scoring='accuracy', 
#                            error_score=0) 

# results=grid_search.fit(X_train_dataset1, y_train_dataset1)

# results.best_params_

# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_valid = sc.transform(X_valid)

XGBoost hyperparameter tuning

hyperparameter tuning을 진행하면서, plot이 사라진게 있습니다.

시간이 오래걸려서 중간에 멈춰버린게 많았거든요.ㅜㅜ

그래도 값들은 직접 구해본것들입니다.!!!

이 부분은 “튜닝을 이런식으로 했구나” 정도만 보고 지나가셔도 될것같습니다.

tuning_dataset = df[:80000]
tuning_testset = df[80000:]
idx = [ i for i in range(80000)]
tuning_dataset["Id"] = idx
tuning_dataset

	int_rate	annual_inc	dti	delinq_2yrs	inq_last_6mths	pub_rec	revol_bal	total_acc	collections_12_mths_ex_med	acc_now_delinq	tot_coll_amt	tot_cur_bal	chargeoff_within_12_mths	delinq_amnt	tax_liens	emp_length1	emp_length2	emp_length3	emp_length4	emp_length5	emp_length6	emp_length7	emp_length8	emp_length9	emp_length10	emp_length11	emp_length12	home_ownership1	home_ownership2	home_ownership3	home_ownership4	home_ownership5	home_ownership6	verification_status1	verification_status2	verification_status3	purpose1	purpose2	purpose3	purpose4	purpose5	purpose6	purpose7	purpose8	purpose9	purpose10	purpose11	purpose12	purpose13	purpose14	initial_list_status1	initial_list_status2	mths_since_last_delinq1	mths_since_last_delinq2	mths_since_last_delinq3	mths_since_last_delinq4	mths_since_last_delinq5	mths_since_last_delinq6	mths_since_last_delinq7	mths_since_last_delinq8	mths_since_last_delinq9	mths_since_last_delinq10	mths_since_last_delinq11	funded_amnt	funded_amnt_inv	total_rec_late_fee	term1	open_acc	installment	revol_util	out_prncp	out_prncp_inv	total_rec_int	fico_range_low	fico_range_high	depvar	Id
0	0.0824	21000.0	29.19	0	1	0	3016	26	0	0	0	11773	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1	0	1	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	0	0	0	0	0	0	1200	1200.0	0.0	1	18	37.74	0.076	0.0	0.0	157.94	765	769	0	0
1	0.1299	80000.0	4.82	0	1	1	5722	24	0	0	0	21875	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1	0	0	1	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	0	0	0	0	0	0	8000	8000.0	0.0	1	8	269.52	0.447	0.0	0.0	1702.42	665	669	0	1
2	0.1299	38000.0	23.66	0	3	0	6511	18	0	0	0	31868	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	1	0	1	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	1	5000	5000.0	0.0	1	7	168.45	0.880	0.0	0.0	1066.64	670	674	0	2
3	0.1367	100000.0	16.27	4	2	0	6849	30	0	0	0	326049	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	1	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	1	0	0	0	0	0	0	15000	15000.0	0.0	1	12	510.27	0.457	0.0	0.0	1256.24	680	684	1	3
4	0.1269	30000.0	25.28	0	1	2	8197	12	0	0	2506	8840	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	1	0	1	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	0	0	0	0	0	0	10000	10000.0	0.0	1	8	335.45	0.416	0.0	0.0	871.04	660	664	1	4
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
79995	0.1333	114480.0	7.81	0	1	1	5680	42	0	0	108	340793	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	1	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	1	0	0	0	0	0	0	1	0	0	0	0	0	10800	10800.0	0.0	1	18	365.62	0.458	0.0	0.0	1868.28	670	674	0	79995
79996	0.0699	88800.0	7.91	0	0	0	9617	11	0	0	0	230979	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	1	0	0	0	0	1	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	1	0	0	0	0	9800	9800.0	0.0	1	4	302.56	0.641	0.0	0.0	0.00	725	729	1	79996
79997	0.1299	32000.0	5.96	0	1	0	952	6	0	0	0	4410	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	1	0	1	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	0	0	0	0	0	0	5600	5600.0	0.0	1	4	188.66	0.090	0.0	0.0	1188.09	745	749	0	79997
79998	0.1333	60000.0	18.58	0	0	0	31287	14	0	0	0	35108	0	0	0	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	1	0	1	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	1	0	0	0	27600	27600.0	0.0	1	7	934.35	0.711	0.0	0.0	6015.99	705	709	0	79998
79999	0.2099	40000.0	22.92	0	0	0	2777	22	0	0	0	23758	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1	0	1	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	0	0	0	0	0	0	11125	11125.0	0.0	0	10	300.91	0.237	0.0	0.0	6743.82	705	709	0	79999

80000 rows × 77 columns

def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=100):
   
    # get new n_estimator
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgboost.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgboost.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='error', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
        print(alg)
    
    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['depvar'], eval_metric='error')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
        
    #Print model report:
    print("\nModel Report")
    print("Training Accuracy : %.4g" % metrics.accuracy_score(dtrain['depvar'].values, dtrain_predictions))

target = 'depvar'
IDcol = 'Id'

# n_estimators를 1000으로 두면 왜 training accuracy가 더 떨어질까? 100보다.
predictors = [x for x in tuning_dataset.columns if x not in [target, IDcol]]
model1 = xgboost.XGBClassifier(learning_rate = 0.01,
                                n_estimators = 1000,
                                max_depth = 5,
                                min_child_weight = 1,
                                gamma = 0,
                                subsample = 0.8,
                                colsample_bytree = 0.8,
                                objective="binary:logistic",
                                nthread= -1,
                                scale_pos_weight = 1,
                                seed=220205                                
                                )
modelfit(model1, tuning_dataset, predictors)

n_estimators fix

1.0.2

## GPU 쓰고 싶다.
# tree_method='gpu_hist',
# predictor='gpu_predictor'


cv_scores = list()
estimator_list = [i for i in range(100, 1000, 50)]
for i in tqdm(range(100, 1000, 50)):
  xgbc = xgboost.XGBClassifier(learning_rate = 0.01,
                                n_estimators = i,
                                max_depth = 5,
                                min_child_weight = 1,
                                gamma = 0,
                                subsample = 0.8,
                                colsample_bytree = 0.8,
                                objective="binary:logistic",
                                nthread= -1,
                                scale_pos_weight = 1,
                                seed=220205,
                                tree_method='gpu_hist',
                                predictor='gpu_predictor'
                                )
  score = cross_val_score(xgbc, X_train_dataset1, y_train_dataset1, cv=5, scoring="f1_macro").mean()
  cv_scores.append(score)

  best_e = [estimator_list[i] for i in range(len(cv_scores)) if cv_scores[i] == np.max(cv_scores)]

plt.figure(figsize=(20,10))
plt.plot(estimator_list, cv_scores, marker='o', linestyle='dashed')
plt.axvline(best_e[0], color='r', linestyle= '--', linewidth=2)

print(f"optimizer tree count : {(cv_scores.index(max(cv_scores)))} ")
# 350
print(f"Train set에 대한 성능 : {max(cv_scores):.4f} ")

Max depth

cv_scores = list()
max_depth_list = [5, 7, 9]
for i in tqdm(max_depth_list):
  xgbc = xgboost.XGBClassifier(learning_rate = 0.01,
                                n_estimators = 350,
                                max_depth = i,
                                min_child_weight = 1,
                                gamma = 0,
                                subsample = 0.8,
                                colsample_bytree = 0.8,
                                objective="binary:logistic",
                                nthread= -1,
                                scale_pos_weight = 1,
                                seed=220205,
                                tree_method='gpu_hist',
                                predictor='gpu_predictor')
  score = cross_val_score(xgbc, X_train_dataset1, y_train_dataset1, cv=5, scoring="f1_macro").mean()
  cv_scores.append(score)

  best_e = [max_depth_list[i] for i in range(len(cv_scores)) if cv_scores[i] == np.max(cv_scores)]

plt.figure(figsize=(20,10))
plt.plot(max_depth_list, cv_scores, marker='o', linestyle='dashed')
plt.axvline(best_e[0], color='r', linestyle= '--', linewidth=2)

print(f"optimizer tree count : {(cv_scores.index(max(cv_scores)))} ")
# 9
print(f"Train set에 대한 성능 : {max(cv_scores):.4f} ")

gamma

cv_scores = list()
gamma_list = [0.0, 0.1, 0.2, 0.3, 0.4]
for i in tqdm(gamma_list):
  xgbc = xgboost.XGBClassifier(learning_rate = 0.01,
                                n_estimators = 350,
                                max_depth = 9,
                                min_child_weight = 1,
                                gamma = i,
                                subsample = 0.8,
                                colsample_bytree = 0.8,
                                objective="binary:logistic",
                                nthread= -1,
                                scale_pos_weight = 1,
                                seed=220205,
                                eval_metric='error',
                                tree_method='gpu_hist',
                                predictor='gpu_predictor')
  score = cross_val_score(xgbc, X_train_dataset1, y_train_dataset1, cv=5, scoring="accuracy").mean()
  cv_scores.append(score)

  best_e = [gamma_list[i] for i in range(len(cv_scores)) if cv_scores[i] == np.max(cv_scores)]

100%|██████████| 5/5 [04:19<00:00, 51.92s/it]

plt.figure(figsize=(20,10))
plt.plot(gamma_list, cv_scores, marker='o', linestyle='dashed')
plt.axvline(best_e[0], color='r', linestyle= '--', linewidth=2)

<matplotlib.lines.Line2D at 0x7f5c64528c90>

print(f"optimizer tree count : {(cv_scores.index(max(cv_scores)))} ")
# 
print(f"Train set에 대한 성능 : {max(cv_scores):.4f} ")

optimizer tree count : 0 
Train set에 대한 성능 : 0.7508

min_child_weight

cv_scores = list()
min_child_weight_list = [i for i in range(1,10,2)]
for i in tqdm(range(1,10,2)):
  xgbc = xgboost.XGBClassifier(learning_rate = 0.1,
                                n_estimators = 350,
                                max_depth = 5,
                                min_child_weight = i,
                                gamma = 0,
                                subsample = 0.8,
                                colsample_bytree = 0.8,
                                objective="binary:logistic",
                                nthread= -1,
                                scale_pos_weight = 1,
                                seed=220205,
                                eval_metric='error',
                                tree_method='gpu_hist',
                                predictor='gpu_predictor')
  score = cross_val_score(xgbc, X_train_dataset1, y_train_dataset1, cv=5, scoring="accuracy").mean()
  cv_scores.append(score)

  best_e = [min_child_weight_list[i] for i in range(len(cv_scores)) if cv_scores[i] == np.max(cv_scores)]

100%|██████████| 5/5 [01:20<00:00, 16.01s/it]

plt.figure(figsize=(20,10))
plt.plot(min_child_weight_list, cv_scores, marker='o', linestyle='dashed')
plt.axvline(best_e[0], color='r', linestyle= '--', linewidth=2)

<matplotlib.lines.Line2D at 0x7f5c4b3b71d0>

print(f"optimizer tree count : {(cv_scores.index(max(cv_scores)))} ")
# 1
print(f"Train set에 대한 성능 : {max(cv_scores):.4f} ")

optimizer tree count : 0 
Train set에 대한 성능 : 0.7541

XGBoost Model Training

위에서 얻어진 값들은 모델의 학습 파라미터로 주고 있습니다.

tree_method='gpu_hist',

predictor='gpu_predictor'

이 부분은 gpu로 학습시킨다는 의미이며, gpu가 없을시 저 부분을 삭제하고 진행하시면 됩니다.

Train Valid Split 에서 설명한대로 학습을 진행하고 있습니다.

model1 = xgboost.XGBClassifier(learning_rate = 0.01,
                                n_estimators = 350,
                                max_depth = 9,
                                min_child_weight = 1,
                                gamma = 0,
                                subsample = 0.8,
                                colsample_bytree = 0.8,
                                objective="binary:logistic",
                                nthread= -1,
                                scale_pos_weight = 1,
                                seed=220205,
                                tree_method='gpu_hist',
                                predictor='gpu_predictor')


model2 = xgboost.XGBClassifier(learning_rate = 0.01,
                                n_estimators = 1000,
                                max_depth = 5,
                                min_child_weight = 10,
                                gamma = 0,
                                subsample = 0.8,
                                colsample_bytree = 0.8,
                                objective="binary:logistic",
                                nthread= -1,
                                scale_pos_weight = 1,
                                seed=220205,
                                tree_method='gpu_hist',
                                predictor='gpu_predictor')

model3 = xgboost.XGBClassifier(learning_rate = 0.01,
                                n_estimators = 1000,
                                max_depth = 5,
                                min_child_weight = 10,
                                gamma = 0,
                                subsample = 0.8,
                                colsample_bytree = 0.8,
                                objective="binary:logistic",
                                nthread= -1,
                                scale_pos_weight = 1,
                                seed=220205,
                                tree_method='gpu_hist',
                                predictor='gpu_predictor')

model4 = xgboost.XGBClassifier(learning_rate = 0.01,
                                n_estimators = 1000,
                                max_depth = 5,
                                min_child_weight = 10,
                                gamma = 0,
                                subsample = 0.8,
                                colsample_bytree = 0.8,
                                objective="binary:logistic",
                                nthread= -1,
                                scale_pos_weight = 1,
                                seed=220205,
                                tree_method='gpu_hist',
                                predictor='gpu_predictor')

model5 = xgboost.XGBClassifier(learning_rate = 0.01,
                                n_estimators = 1000,
                                max_depth = 5,
                                min_child_weight = 10,
                                gamma = 0,
                                subsample = 0.8,
                                colsample_bytree = 0.8,
                                objective="binary:logistic",
                                nthread= -1,
                                scale_pos_weight = 1,
                                seed=220205,
                                tree_method='gpu_hist',
                                predictor='gpu_predictor')

model1.fit(X_train_dataset1, y_train_dataset1)
model2.fit(X_train_dataset2, y_train_dataset2)
model3.fit(X_train_dataset3, y_train_dataset3)
model4.fit(X_train_dataset4, y_train_dataset4)
model5.fit(X_train_dataset5, y_train_dataset5)

XGBClassifier(colsample_bytree=0.8, learning_rate=0.01, max_depth=5,
              min_child_weight=10, n_estimators=1000, nthread=-1,
              predictor='gpu_predictor', seed=220205, subsample=0.8,
              tree_method='gpu_hist')

y_pred1 = model1.predict(X_train5)
y_pred2 = model2.predict(X_train4)
y_pred3 = model3.predict(X_train3)
y_pred4 = model4.predict(X_train2)
y_pred5 = model5.predict(X_train1)

# y_valid가 0 또는 1일 확률 출력
y_prob1 = model1.predict_proba(X_train5)
y_prob2 = model2.predict_proba(X_train4)
y_prob3 = model3.predict_proba(X_train3)
y_prob4 = model4.predict_proba(X_train2)
y_prob5 = model5.predict_proba(X_train1)

# 1로 예측된 y_valid 갯수 및 비율 출력
print("thr > 0.5")
print(f" model1의 예측비율 : {y_pred1.sum() / len(y_pred1)}")
print(f" model2의 예측비율 : {y_pred2.sum() / len(y_pred2)}")
print(f" model3의 예측비율 : {y_pred3.sum() / len(y_pred3)}")
print(f" model4의 예측비율 : {y_pred4.sum() / len(y_pred4)}")
print(f" model5의 예측비율 : {y_pred5.sum() / len(y_pred5)}")

# thr 0.3보다 큰 경우
thr = 0.38
print(f"thr > {thr}")
pred1 = (y_prob1[:,1] >= thr).astype(np.int64)
pred2 = (y_prob2[:,1] >= thr).astype(np.int64)
pred3 = (y_prob3[:,1] >= thr).astype(np.int64)
pred4 = (y_prob4[:,1] >= thr).astype(np.int64)
pred5 = (y_prob5[:,1] >= thr).astype(np.int64)
print(f" model1의 예측비율 : {pred1.sum() / len(pred1)}")
print(f" model1의 예측비율 : {pred2.sum() / len(pred2)}")
print(f" model1의 예측비율 : {pred3.sum() / len(pred3)}")
print(f" model1의 예측비율 : {pred4.sum() / len(pred4)}")
print(f" model1의 예측비율 : {pred5.sum() / len(pred5)}")

thr > 0.5
 model1의 예측비율 : 0.2039
 model2의 예측비율 : 0.2112
 model3의 예측비율 : 0.2054
 model4의 예측비율 : 0.21025
 model5의 예측비율 : 0.2036
thr > 0.38
 model1의 예측비율 : 0.3277
 model1의 예측비율 : 0.3358
 model1의 예측비율 : 0.32705
 model1의 예측비율 : 0.33205
 model1의 예측비율 : 0.32595

여기에서 보면 predict()로 구하게 되면 thr값이 0.5가 default여서 y=1 ratio의 값이 train_dataset과 차이가 많이 나게 됩니다.

그래서 predict_proba()로 구해서 thr의 값을 수정해주는 작업을 진행해 주어야합니다.

사실 thr 값을 제가 이렇게 바꾸는게, 기껏 모델 학습시켜놓은것을 망친다는 생각도 했으며,

모델을 학습시킬대 thr값을 제가 변경해주면서 학습시키는것이 가능한가?

이 부분에 대한 공부도 필요합니다.

docs도 뒤져보고 있는데, 능력부족으로 파악하지 못했습니다.

직접 구현해야 하는것인지 ㅜㅜ

# 평가 함수 정의
def get_clf_eval(y_actual, y_pred):
    accuracy = accuracy_score(y_actual, y_pred)
    precision = precision_score(y_actual, y_pred)
    recall = recall_score(y_actual, y_pred)
    AUC = roc_auc_score(y_actual, y_pred)
    F1 = f1_score(y_actual, y_pred)
    print('\naccuracy: {:.4f}'.format(accuracy))
    print('precision: {:.4f}'.format(precision))
    print('recall: {:.4f}'.format(recall))
    print('AUC: {:.4f}'.format(AUC))
    print('F1: {:.4f}'.format(F1))
    
    # sns.heatmap(confusion_matrix(y_actual, y_pred), annot=True, fmt='d', cmap='YlGnBu')

# xgboost 성능 확인
get_clf_eval(y_train5, pred1)
get_clf_eval(y_train4, pred2)
get_clf_eval(y_train3, pred3)
get_clf_eval(y_train2, pred4)
get_clf_eval(y_train1, pred5)


accuracy: 0.7500
precision: 0.6038
recall: 0.6221
AUC: 0.7158
F1: 0.6128

accuracy: 0.7428
precision: 0.6127
recall: 0.6181
AUC: 0.7116
F1: 0.6154

accuracy: 0.7459
precision: 0.6149
recall: 0.6107
AUC: 0.7115
F1: 0.6128

accuracy: 0.7456
precision: 0.6026
recall: 0.6203
AUC: 0.7127
F1: 0.6113

accuracy: 0.7435
precision: 0.6059
recall: 0.6066
AUC: 0.7081
F1: 0.6062

만족스러운 결과물은 아닙니다…

MNIST나 sklearn에서 제공하는 다른 데이터들에서는 항상 학습시키면 90이라는 숫자가 넘게 나왔는데,

train_dataset에서 학습을 시키는데도 불구하고 80을 못넘겼습니다…

test에서는 얼마나 더 떨어질지 ㅠㅠ

Randomforest hyperparameter tuning

Randomforest Model Training

Submission

# 제출 양식 다운로드
submit = pd.read_csv('/content/drive/MyDrive/22-01-28/sample_submission.csv')

# prediction 수행
df_test = pd.read_csv('/content/drive/MyDrive/22-01-28/test.csv')

test = df_test.drop('ID', axis=1)

test

	int_rate	annual_inc	dti	delinq_2yrs	inq_last_6mths	pub_rec	revol_bal	total_acc	collections_12_mths_ex_med	acc_now_delinq	tot_coll_amt	tot_cur_bal	chargeoff_within_12_mths	delinq_amnt	tax_liens	emp_length1	emp_length2	emp_length3	emp_length4	emp_length5	emp_length6	emp_length7	emp_length8	emp_length9	emp_length10	emp_length11	emp_length12	home_ownership1	home_ownership2	home_ownership3	home_ownership4	home_ownership5	home_ownership6	verification_status1	verification_status2	verification_status3	purpose1	purpose2	purpose3	purpose4	purpose5	purpose6	purpose7	purpose8	purpose9	purpose10	purpose11	purpose12	purpose13	purpose14	initial_list_status1	initial_list_status2	mths_since_last_delinq1	mths_since_last_delinq2	mths_since_last_delinq3	mths_since_last_delinq4	mths_since_last_delinq5	mths_since_last_delinq6	mths_since_last_delinq7	mths_since_last_delinq8	mths_since_last_delinq9	mths_since_last_delinq10	mths_since_last_delinq11	funded_amnt	funded_amnt_inv	total_rec_late_fee	term1	open_acc	installment	revol_util	out_prncp	out_prncp_inv	total_rec_int	fico_range_low	fico_range_high
0	0.1449	16380.0	26.08	0	0	1	3486	10	0	0	0	9214	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	0	0	0	0	0	0	8100	8100.0	0.00	1	4	278.78	0.311	0.0	0.0	460.40	700	704
1	0.1899	65000.0	13.97	0	0	0	25305	20	0	0	0	115612	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	1	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	1	0	0	0	0	20000	20000.0	0.00	0	10	518.71	0.885	0.0	0.0	4866.68	675	679
2	0.1049	53000.0	23.28	0	0	0	10910	21	0	0	0	33017	0	0	0	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	1	0	1	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	1	0	0	0	0	10000	10000.0	16.25	1	7	324.98	0.580	0.0	0.0	1451.06	675	679
3	0.1757	71800.0	30.32	0	0	0	42423	26	0	0	0	152515	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	0	1	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	0	0	0	0	0	0	27200	27200.0	0.00	0	16	684.36	0.701	0.0	0.0	7068.11	665	669
4	0.2020	50000.0	25.61	0	2	0	21703	24	0	0	0	135282	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	1	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	1	0	1	0	0	0	0	0	0	0	0	0	0	22000	22000.0	0.00	0	13	585.32	0.622	0.0	0.0	7754.20	710	714
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
35811	0.1049	110000.0	9.02	0	0	1	8991	33	0	0	0	17468	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	1	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	0	0	0	0	0	0	12000	12000.0	0.00	1	13	389.98	0.346	0.0	0.0	1406.13	665	669
35812	0.0824	45000.0	32.56	1	0	0	20966	29	0	0	0	24802	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1	0	1	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	1	0	0	1	0	0	0	0	0	0	0	0	5000	5000.0	0.00	1	9	157.24	0.690	0.0	0.0	495.70	705	709
35813	0.1368	49000.0	17.60	0	0	0	5597	7	0	0	0	20691	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	1	0	1	0	0	0	0	0	0	0	0	0	0	6000	6000.0	0.00	1	7	204.14	0.333	0.0	0.0	1355.06	705	709
35814	0.1139	84852.0	9.96	0	0	1	7184	13	0	0	0	31396	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	0	1	1	0	0	0	0	0	0	0	0	0	0	10000	10000.0	0.00	1	8	329.24	0.352	0.0	0.0	1615.15	725	729
35815	0.1274	60000.0	3.64	0	0	0	77	7	0	0	1468	178885	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	1	0	0	0	0	0	0	0	0	0	0	1	0	0	0	0	0	1	0	0	0	0	1	0	0	0	0	0	0	1500	1500.0	0.00	1	3	50.36	0.770	0.0	0.0	311.08	675	679

35816 rows × 75 columns

# out1 = model1.predict(test)
# out2 = model2.predict(test)
# out3 = model3.predict(test)
# out4 = model4.predict(test)
# out5 = model5.predict(test)


prob1 = model1.predict_proba(test)
out1 = (prob1[:,1] >= thr).astype(np.int64)

prob2 = model2.predict_proba(test)
out2 = (prob2[:,1] >= thr).astype(np.int64)

prob3 = model3.predict_proba(test)
out3 = (prob3[:,1] >= thr).astype(np.int64)

prob4 = model4.predict_proba(test)
out4 = (prob4[:,1] >= thr).astype(np.int64)

prob5 = model5.predict_proba(test)
out5 = (prob5[:,1] >= thr).astype(np.int64)




print(out1.sum() / len(out1))
print(out2.sum() / len(out2))
print(out3.sum() / len(out3))
print(out4.sum() / len(out4))
print(out5.sum() / len(out5))

0.3358554835827563
0.334682823319187
0.332449184721912
0.3370002233638597
0.3345432209068573

result = []
for i in range(len(test)):
  if 3 == (out1[i] + out2[i] + out3[i] + out4[i] + out5[i]):
    result.append(1)
  elif 4 == (out1[i] + out2[i] + out3[i] + out4[i] + out5[i]):
    result.append(1)
  elif 5 == (out1[i] + out2[i] + out3[i] + out4[i] + out5[i]):
    result.append(1)
  else:
    result.append(0)

    
  

print(f" 최종 결과물!!! ")
print(f" {sum(result) / len(result)} ")
print(len(result))
print(result.count(1))

submit["answer"] = result  

 최종 결과물!!! 
 0.3345153004243913 
35816
11981

# 제출 파일 저장
submit.to_csv('/content/drive/MyDrive/22-02-05/2022-02-05-14.csv', index=False)

y=1 ratio가 우선순위 1번이며,

f1 score가 우선순위 2번으로,

모델을 수정하고, 학습시켰습니다.

대회 마감일날, 최종 제출할때도 위의 2개의 원칙을 지켜서 제출해보려 합니다.(공부를 해서 뭔가를 알아내기 전까지는 그대로일듯 합니다.)

아래의 주석들은 제가 제출한 파일들의 점수와 hyperparameter들입니다.

끝으로..

공부할게 참 많습니다.!!!!

##########################################3
# CSV file 설명
# 2022-02-05-1 : hyperparameter tuning 하지 않았고, predict(thrhold:0.5)로만 하였음.
# 2022-02-05-2 : hyperparameter tuning 하지 않았고, predict(thrhold:0.42)로만 하였음.
# 2022-02-05-3 : hyperparameter tuning 한 결과, predict(thrhold:0.42)로만 하였음.(어떻게 튜닝했는데 더 떨어지지?)
# xgboost.XGBClassifier(learning_rate = 0.01,
#                                 n_estimators = 350,
#                                 max_depth = 9,
#                                 min_child_weight = 1,
#                                 gamma = 0,
#                                 subsample = 0.8,
#                                 colsample_bytree = 0.8,
#                                 objective="binary:logistic",
#                                 nthread= -1,
#                                 scale_pos_weight = 1,
#                                 seed=220205,
#                                 tree_method='gpu_hist',
#                                 predictor='gpu_predictor')
# 2022-02-05-4 : hyperparameter tuning 한 결과, predict(thrhold:0.38)로만 하였음.(최고기록: 0.717169)
# 2022-02-05-5 : hyperparameter tuning 한 결과, predict(thrhold:0.35)로 하였음. 0.35로 했더니 ratio of label는 망가졌으나(0.39), fi score가 높음! 0.637~0.638
# 2022-02-05-6 : hyperparameter tuning 한 결과, predict(thrhold:0.35)로 하였음.
# score는 713982
# xgboost.XGBClassifier(learning_rate = 0.01,
#                                 n_estimators = 1000,
#                                 max_depth = 5,
#                                 min_child_weight = 1,
#                                 gamma = 0,
#                                 subsample = 0.8,
#                                 colsample_bytree = 0.8,
#                                 objective="binary:logistic",
#                                 nthread= -1,
#                                 scale_pos_weight = 1,
#                                 seed=220205,
#                                 eval_metric='error',
#                                 tree_method='gpu_hist',
#                                 predictor='gpu_predictor')
# 2022-02-05-7 : hyperparameter tuning 한 결과, predict(thrhold:0.38)로 하였음.
# score는 713330
# xgboost.XGBClassifier(learning_rate = 0.01,
#                                 n_estimators = 1000,
#                                 max_depth = 5,
#                                 min_child_weight = 1,
#                                 gamma = 0,
#                                 subsample = 0.8,
#                                 colsample_bytree = 0.8,
#                                 objective="binary:logistic",
#                                 nthread= -1,
#                                 scale_pos_weight = 1,
#                                 seed=220205,
#                                 eval_metric='error',
#                                 tree_method='gpu_hist',
#                                 predictor='gpu_predictor')
# 2022-02-05-8 : hyperparameter tuning 한 결과, predict(thrhold:0.4)로 하였음.
# score는 713574
# xgboost.XGBClassifier(learning_rate = 0.01,
#                                 n_estimators = 350,
#                                 max_depth = 9,
#                                 min_child_weight = 1,
#                                 gamma = 0,
#                                 subsample = 0.8,
#                                 colsample_bytree = 0.8,
#                                 objective="binary:logistic",
#                                 nthread= -1,
#                                 scale_pos_weight = 1,
#                                 seed=220205,
#                                 eval_metric='error',
#                                 tree_method='gpu_hist',
#                                 predictor='gpu_predictor')
# 2022-02-05-9 : hyperparameter tuning 한 결과, predict(thrhold:0.405)로 하였음.
# score는 712255
# xgboost.XGBClassifier(learning_rate = 0.01,
#                                 n_estimators = 350,
#                                 max_depth = 9,
#                                 min_child_weight = 1,
#                                 gamma = 0,
#                                 subsample = 0.8,
#                                 colsample_bytree = 0.8,
#                                 objective="binary:logistic",
#                                 nthread= -1,
#                                 scale_pos_weight = 1,
#                                 seed=220205,
#                                 tree_method='gpu_hist',
#                                 predictor='gpu_predictor')
# 2022-02-05-10 : hyperparameter tuning 한 결과, predict(thrhold:0.4)로 하였음.
# score는 713574
# xgboost.XGBClassifier(learning_rate = 0.01,
#                                 n_estimators = 350,
#                                 max_depth = 9,
#                                 min_child_weight = 1,
#                                 gamma = 0,
#                                 subsample = 0.8,
#                                 colsample_bytree = 0.8,
#                                 objective="binary:logistic",
#                                 nthread= -1,
#                                 scale_pos_weight = 1,
#                                 seed=220205,
#                                 tree_method='gpu_hist',
#                                 predictor='gpu_predictor')
# 2022-02-05-11 : hyperparameter tuning 한 결과, predict(thrhold:0.38)로 하였음.
# score는 716175
# xgboost.XGBClassifier(learning_rate = 0.01,
#                                 n_estimators = 350,
#                                 max_depth = 9,
#                                 min_child_weight = 1,
#                                 gamma = 0,
#                                 subsample = 0.8,
#                                 colsample_bytree = 0.8,
#                                 objective="binary:logistic",
#                                 nthread= -1,
#                                 scale_pos_weight = 1,
#                                 seed=220205,
#                                 tree_method='gpu_hist',
#                                 predictor='gpu_predictor')
# 2022-02-05-12 : hyperparameter tuning 한 결과, predict(thrhold:0.36)로 하였음.
# score는 716011
# xgboost.XGBClassifier(learning_rate = 0.01,
#                                 n_estimators = 350,
#                                 max_depth = 9,
#                                 min_child_weight = 1,
#                                 gamma = 0,
#                                 subsample = 0.8,
#                                 colsample_bytree = 0.8,
#                                 objective="binary:logistic",
#                                 nthread= -1,
#                                 scale_pos_weight = 1,
#                                 seed=220205,
#                                 tree_method='gpu_hist',
#                                 predictor='gpu_predictor')
# 2022-02-05-13 : hyperparameter tuning 한 결과, predict(thrhold:0.38-0.4)로 하였음.
# score는 713264
# model1만 다르게 해본거.
# 2022-02-05-14 : hyperparameter tuning 한 결과, predict(thrhold:0.38)로 하였음.
# score는 713171
# xgboost.XGBClassifier(learning_rate = 0.01,
#                                 n_estimators = 1000,
#                                 max_depth = 5,
#                                 min_child_weight = 10,
#                                 gamma = 0,
#                                 subsample = 0.8,
#                                 colsample_bytree = 0.8,
#                                 objective="binary:logistic",
#                                 nthread= -1,
#                                 scale_pos_weight = 1,
#                                 seed=220205,
#                                 tree_method='gpu_hist',
#                                 predictor='gpu_predictor')

Twitter Facebook LinkedIn