[MNC] Predicting Borrower Loan Default
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import xgboost
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, RidgeCV, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
import sklearn.metrics as metrics
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc, confusion_matrix
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
EDA
Variable descriptions
- int_rate : Interest rate of the loan the applicant received
- annual_inc : Annual income
- dti : Debt-to-income ratio
- delinq_2yrs : Delinquencies on lines of credit in the last 2 years
- inq_last_6mths : Inquiries into the applicant's credit during the last 6 months
- pub_rec : Number of bankruptcies listed in the public record
- revol_bal : Total credit revolving balance
- total_acc : Total number of credit card accounts in the applicant's history (num_total_cc_accounts)
- collections_12_mths_ex_med : Number of collections in the last 12 months, excluding medical collections (num_collections_last_12m)
- acc_now_delinq : The number of accounts on which the borrower is now delinquent
- tot_coll_amt : The total amount the applicant has ever had against them in collections (total_collection_amount_ever)
- tot_cur_bal : Total current balance of all accounts
- chargeoff_within_12_mths : Number of charge-offs within the last 12 months at the time of application for the secondary applicant
- delinq_amnt : Delinquency amount
- tax_liens : Number of tax liens
- emp_length1 ~ 12 : Number of years in the job (one-hot encoded)
- home_ownership1 ~ 6 : The ownership status of the applicant's residence (one-hot encoded)
- verification_status1 ~ 3 : Type of verification of the joint income (verification_income_joint)
- purpose1 ~ 14 : The purpose of the loan (one-hot encoded)
- initial_list_status1 ~ 2 : Initial listing status of the loan
- mths_since_last_delinq1 ~ 11 : Months since the last delinquency
- funded_amnt : Funded amount
- funded_amnt_inv : Funded amount by investors
- total_rec_late_fee : Late fees received to date
- term1 : The number of payments on the loan; values are in months and can be either 36 or 60
- open_acc : The number of open credit lines in the borrower's credit file
- installment : The monthly payment owed by the borrower if the loan originates
- revol_util : Revolving line utilization rate, i.e. the amount of credit the borrower is using relative to all available revolving credit
- out_prncp : Remaining outstanding principal for the total amount funded
- out_prncp_inv : Remaining outstanding principal for the total amount funded by investors
- total_rec_int : Interest received to date
- fico_range_low : The lower boundary of the range the borrower's FICO score at loan origination belongs to
- fico_range_high : The upper boundary of the range the borrower's FICO score at loan origination belongs to
- depvar : Whether the customer defaulted (the dependent variable)
df = pd.read_csv('/content/drive/MyDrive/22-01-28/train.csv')
df.head()
(Output: df.head(), the first 5 of 100,000 rows across all 76 columns, int_rate through depvar.)
Out of the box, the data MNC provided was clean, and the feature engineering was already well done.
(At first, having just received the data, I spent a lot of time doing my own feature engineering, only to find it had already been done well…)
df.describe()
(Output: df.describe(), summary statistics for all 76 columns over 100,000 rows. Highlights: mean int_rate ≈ 0.1308, mean annual_inc ≈ 74,361, mean depvar ≈ 0.3257, and every one-hot dummy column ranges from 0 to 1.)
# Check for missing values
pd.DataFrame(df.isnull().sum()).rename(columns={0:'Null Count'}).T
(Output: a 1 × 76 table of null counts, 0 for every column.)
This is where the quality of the feature engineering really shows: there is not a single missing value (NaN).
Presumably the intent is for us to concentrate on model tuning.
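As a quick aside, the same check collapses to a single number (my addition, not from the original notebook):
assert df.isnull().sum().sum() == 0  # total count of missing cells across the whole frame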
# Check the label (depvar) ratio
print('y=1 ratio :', df.depvar.sum()/len(df))
y=1 ratio : 0.32569
The f1 score, the y=1 ratio, and finally macro f1:
these three were what made this problem a real struggle… I'll come back to them at the end.
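For reference, macro f1 is the unweighted mean of the per-class f1 scores, so with binary labels the minority class weighs as much as the majority class. A minimal illustration with sklearn (my addition, not from the original notebook):
from sklearn.metrics import f1_score
y_true = [0, 0, 0, 1, 1]
y_pred = [0, 0, 1, 1, 0]
print(f1_score(y_true, y_pred))                   # 0.5  (f1 of class 1 only)
print(f1_score(y_true, y_pred, average='macro'))  # (0.6667 + 0.5) / 2 ≈ 0.5833  (mean of per-class f1)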
Train Valid Split
Here is how I approached the problem. I chose XGBClassifier as the model,
and to train it I split the 100,000 rows into 5 equal parts
and trained on 4 parts at a time.
For example,
dataset1, dataset2, dataset3, dataset4, dataset5 each receive 20,000 rows, and
model1 = trained on 1,2,3,4 (fold 5 held out)
model2 = trained on 1,2,3,5 (fold 4 held out)
model3 = trained on 1,2,4,5 (fold 3 held out)
model4 = trained on 1,3,4,5 (fold 2 held out)
model5 = trained on 2,3,4,5 (fold 1 held out)
Proceeding this way, on the final test_dataset:
out1 = model1(test)
out2 = model2(test)
out3 = model3(test)
out4 = model4(test)
out5 = model5(test)
For the submission csv I chose to predict a class whenever 3 or more of the 5 models agree on it, as sketched below.
(There is still a lot I haven't tried: oversampling with SMOTE, training a RandomForest model, and more.
I squandered my time going down rabbit holes, so I ended up training in a rush on the competition deadline.
Having no experience, there were so many terms I didn't know…)
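A minimal sketch of that voting rule (out1…out5 are the arrays built near the end of this notebook; with five binary voters a tie is impossible):
import numpy as np
preds = np.stack([out1, out2, out3, out4, out5])  # shape (5, n_samples), entries in {0, 1}
majority = (preds.sum(axis=0) >= 3).astype(int)   # 1 wherever at least 3 of the 5 models vote 1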
X = df.drop('depvar', axis=1)
y = df['depvar']
X_train1 = X[:20000]
X_train2 = X[20000:40000]
X_train3 = X[40000:60000]
X_train4 = X[60000:80000]
X_train5 = X[80000:]
y_train1 = y[:20000]
y_train2 = y[20000:40000]
y_train3 = y[40000:60000]
y_train4 = y[60000:80000]
y_train5 = y[80000:]
##########
frames = [X_train1, X_train2, X_train3, X_train4]
X_train_dataset1 = pd.concat(frames)
frames = [X_train1, X_train2, X_train3, X_train5]
X_train_dataset2 = pd.concat(frames)
frames = [X_train1, X_train2, X_train4, X_train5]
X_train_dataset3 = pd.concat(frames)
frames = [X_train1, X_train3, X_train4, X_train5]
X_train_dataset4 = pd.concat(frames)
frames = [X_train2, X_train3, X_train4, X_train5]
X_train_dataset5 = pd.concat(frames)
##########
frames = [y_train1, y_train2, y_train3, y_train4]
y_train_dataset1 = pd.concat(frames)
frames = [y_train1, y_train2, y_train3, y_train5]
y_train_dataset2 = pd.concat(frames)
frames = [y_train1, y_train2, y_train4, y_train5]
y_train_dataset3 = pd.concat(frames)
frames = [y_train1, y_train3, y_train4, y_train5]
y_train_dataset4 = pd.concat(frames)
frames = [y_train2, y_train3, y_train4, y_train5]
y_train_dataset5 = pd.concat(frames)
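For reference, sklearn's KFold can generate these same leave-one-fold-out training sets more compactly. A sketch of an equivalent alternative, not what this notebook actually ran (with shuffle=False the folds are exactly the contiguous 20,000-row blocks above, though enumerated in the reverse order of the dataset numbering):
from sklearn.model_selection import KFold
X_train_datasets, y_train_datasets = [], []
for train_idx, _ in KFold(n_splits=5, shuffle=False).split(X):
    # train_idx covers the four folds that are kept; the held-out fold is skipped
    X_train_datasets.append(X.iloc[train_idx])
    y_train_datasets.append(y.iloc[train_idx])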
# Check the number of rows in the training and validation sets
print(f"X_train1 = {len(X_train1)}")
print(f"X_train2 = {len(X_train2)}")
print(f"X_train3 = {len(X_train3)}")
print(f"X_train4 = {len(X_train4)}")
print(f"X_train5 = {len(X_train5)}")
print()
print(f"y_train1 = {len(y_train1)}")
print(f"y_train2 = {len(y_train2)}")
print(f"y_train3 = {len(y_train3)}")
print(f"y_train4 = {len(y_train4)}")
print(f"y_train5 = {len(y_train5)}")
print()
print(f"y_train1의 label ratio check = {y_train1.sum() / len(y_train1)}")
print(f"y_train2의 label ratio check = {y_train2.sum() / len(y_train2)}")
print(f"y_train3의 label ratio check = {y_train3.sum() / len(y_train3)}")
print(f"y_train4의 label ratio check = {y_train4.sum() / len(y_train4)}")
print(f"y_train5의 label ratio check = {y_train5.sum() / len(y_train5)}")
X_train1 = 20000
X_train2 = 20000
X_train3 = 20000
X_train4 = 20000
X_train5 = 20000

y_train1 = 20000
y_train2 = 20000
y_train3 = 20000
y_train4 = 20000
y_train5 = 20000

y_train1 label ratio = 0.3256
y_train2 label ratio = 0.3226
y_train3 label ratio = 0.3293
y_train4 label ratio = 0.3329
y_train5 label ratio = 0.31805
Something important just showed up above.
I tried to puzzle it out to the end, but I hit my limits, so once this post is done I'm off to study…
(There really are too many terms I don't know yet; I feel how much I still lack.)
As I'll say again at the end, the essence of the final submission I'm aiming for is this:

- The y=1 ratio in the predictions for the test_dataset should be nearly the same as the y=1 ratio in the train_dataset.
- Macro f1, this competition's evaluation metric, should be high.
- Obviously, as many classes as possible should be predicted correctly.

This was the really hard part.
As of writing this post I haven't solved it, but…
here is the phenomenon:
when the f1 score goes up, the y=1 ratio swings way off.
ex) f1 score 0.666, y=1 ratio 0.5
The f1 score depends on precision and recall,
and macro f1 depends on the per-class f1 scores.
So raising precision and recall should be enough, but when I pushed only those up,
the y=1 ratio swung off and in the end the leaderboard score came out low as well.
(For the record, it's only my guess that the low score is caused by the y=1 ratio swinging off.)
More on this further down.
X_train_dataset4
(Output: X_train_dataset4, 80000 rows × 75 columns. Folds 1, 3, 4, and 5 concatenated, so the index jumps from 19999 straight to 40000.)
Single Model (XGBoost)
XGBClassifier hyperparameter descriptions
- LINK : https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBClassifier
- max_depth=3 : Maximum depth of the decision trees.
- learning_rate=0.1 : Boosting learning rate (eta), between 0 and 1. After each boosting step, the newly added weights are scaled by this factor. Lower values are more conservative and require more trees to converge.
- n_estimators=100 : Number of boosting rounds, i.e. the number of boosted trees.
- silent=True : Whether to print messages while boosting runs.
- objective="reg:linear" : The learning task. Predefined tasks can be given as strings; otherwise a callable can be supplied.
- booster="gbtree" : One of 'gbtree', 'gblinear', or 'dart'. 'dart' adds dropout (randomly dropping trees to prevent overfitting). 'gblinear' builds a regularized linear model instead of trees (similar to lasso regression).
- nthread=None : Deprecated.
- n_jobs : Number of threads to use.
- gamma=0 : Minimum loss reduction required to split a node.
- min_child_weight=1 : Minimum sum of hessian needed in a child node.
- max_delta_step=0 : Makes updates more conservative. For datasets with imbalanced classes, values from 1 to 10 are suggested.
- subsample=1 : Fraction of samples used per boosting round.
- colsample_bytree=1 : Fraction of feature columns used per tree.
- colsample_bylevel=1 : Fraction of feature columns used per tree level.
- colsample_bynode=1 : Fraction of feature columns used per tree node.
- reg_alpha=0 : L1 regularization on the weights. Larger values are more conservative.
- reg_lambda=1 : L2 regularization on the weights. Larger values are more conservative.
- base_score=.5 : The initial prediction (global bias).
- seed=None : Deprecated.
- random_state=0 : Random number seed.
- missing=None : The value to be interpreted as missing data. None means np.nan.
- importance_type='gain' : The feature importance type; one of 'gain', 'weight', 'cover', 'total_gain', 'total_cover'.
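One parameter the code below sets but the list above skips is scale_pos_weight, which re-weights the positive class. A common heuristic for imbalanced labels (my note, not something this notebook does; it keeps the default of 1) is the negative-to-positive ratio:
pos_ratio = 0.32569                # y=1 ratio from the EDA above
spw = (1 - pos_ratio) / pos_ratio  # ≈ 2.07; a candidate value for scale_pos_weight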
# # Hyperparameter tuning
# xgb_clf = xgboost.XGBClassifier()
# param_grid = {'max_depth':[5,10],
# 'n_estimators':[50, 100],
# }
# cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=2)
# grid_search = GridSearchCV(estimator=xgb_clf,
# param_grid=param_grid,
# n_jobs=-1,
# cv=cv,
# scoring='accuracy',
# error_score=0)
# results=grid_search.fit(X_train_dataset1, y_train_dataset1)
# results.best_params_
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_valid = sc.transform(X_valid)
XGBoost hyperparameter tuning
While doing the hyperparameter tuning, some plots went missing:
several runs took so long that I stopped them partway through.
Still, the values below are ones I actually computed!
For this part it's enough to skim and think "so this is roughly how the tuning was done."
tuning_dataset = df[:80000].copy()  # .copy() so adding the Id column below doesn't trigger SettingWithCopyWarning
tuning_testset = df[80000:]
idx = [i for i in range(80000)]
tuning_dataset["Id"] = idx
tuning_dataset
(Output: tuning_dataset, 80000 rows × 77 columns; the original 76 columns plus the new Id column.)
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=100):
    # Use xgboost.cv with early stopping to pick a good n_estimators
    # (`target` is the module-level name defined in the next cell)
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgboost.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgboost.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                              metrics='error', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
        print(alg)
    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['depvar'], eval_metric='error')
    # Predict on the training set
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]
    # Print model report
    print("\nModel Report")
    print("Training Accuracy : %.4g" % metrics.accuracy_score(dtrain['depvar'].values, dtrain_predictions))
target = 'depvar'
IDcol = 'Id'
# Why does training accuracy come out lower with n_estimators=1000 than with 100?
predictors = [x for x in tuning_dataset.columns if x not in [target, IDcol]]
model1 = xgboost.XGBClassifier(learning_rate = 0.01,
n_estimators = 1000,
max_depth = 5,
min_child_weight = 1,
gamma = 0,
subsample = 0.8,
colsample_bytree = 0.8,
objective="binary:logistic",
nthread= -1,
scale_pos_weight = 1,
seed=220205
)
modelfit(model1, tuning_dataset, predictors)
Fixing n_estimators
## To use the GPU:
# tree_method='gpu_hist',
# predictor='gpu_predictor'
cv_scores = list()
estimator_list = [i for i in range(100, 1000, 50)]
for i in tqdm(range(100, 1000, 50)):
xgbc = xgboost.XGBClassifier(learning_rate = 0.01,
n_estimators = i,
max_depth = 5,
min_child_weight = 1,
gamma = 0,
subsample = 0.8,
colsample_bytree = 0.8,
objective="binary:logistic",
nthread= -1,
scale_pos_weight = 1,
seed=220205,
tree_method='gpu_hist',
predictor='gpu_predictor'
)
score = cross_val_score(xgbc, X_train_dataset1, y_train_dataset1, cv=5, scoring="f1_macro").mean()
cv_scores.append(score)
best_e = [estimator_list[i] for i in range(len(cv_scores)) if cv_scores[i] == np.max(cv_scores)]
plt.figure(figsize=(20,10))
plt.plot(estimator_list, cv_scores, marker='o', linestyle='dashed')
plt.axvline(best_e[0], color='r', linestyle= '--', linewidth=2)
print(f"optimizer tree count : {(cv_scores.index(max(cv_scores)))} ")
# 350
print(f"Train set에 대한 성능 : {max(cv_scores):.4f} ")
Max depth
cv_scores = list()
max_depth_list = [5, 7, 9]
for i in tqdm(max_depth_list):
xgbc = xgboost.XGBClassifier(learning_rate = 0.01,
n_estimators = 350,
max_depth = i,
min_child_weight = 1,
gamma = 0,
subsample = 0.8,
colsample_bytree = 0.8,
objective="binary:logistic",
nthread= -1,
scale_pos_weight = 1,
seed=220205,
tree_method='gpu_hist',
predictor='gpu_predictor')
score = cross_val_score(xgbc, X_train_dataset1, y_train_dataset1, cv=5, scoring="f1_macro").mean()
cv_scores.append(score)
best_e = [max_depth_list[i] for i in range(len(cv_scores)) if cv_scores[i] == np.max(cv_scores)]
plt.figure(figsize=(20,10))
plt.plot(max_depth_list, cv_scores, marker='o', linestyle='dashed')
plt.axvline(best_e[0], color='r', linestyle= '--', linewidth=2)
print(f"optimizer tree count : {(cv_scores.index(max(cv_scores)))} ")
# 9
print(f"Train set에 대한 성능 : {max(cv_scores):.4f} ")
gamma
cv_scores = list()
gamma_list = [0.0, 0.1, 0.2, 0.3, 0.4]
for i in tqdm(gamma_list):
xgbc = xgboost.XGBClassifier(learning_rate = 0.01,
n_estimators = 350,
max_depth = 9,
min_child_weight = 1,
gamma = i,
subsample = 0.8,
colsample_bytree = 0.8,
objective="binary:logistic",
nthread= -1,
scale_pos_weight = 1,
seed=220205,
eval_metric='error',
tree_method='gpu_hist',
predictor='gpu_predictor')
score = cross_val_score(xgbc, X_train_dataset1, y_train_dataset1, cv=5, scoring="accuracy").mean()
cv_scores.append(score)
best_e = [gamma_list[i] for i in range(len(cv_scores)) if cv_scores[i] == np.max(cv_scores)]
100%|██████████| 5/5 [04:19<00:00, 51.92s/it]
plt.figure(figsize=(20,10))
plt.plot(gamma_list, cv_scores, marker='o', linestyle='dashed')
plt.axvline(best_e[0], color='r', linestyle= '--', linewidth=2)
print(f"optimizer tree count : {(cv_scores.index(max(cv_scores)))} ")
#
print(f"Train set에 대한 성능 : {max(cv_scores):.4f} ")
optimizer tree count : 0 Train set에 대한 성능 : 0.7508
min_child_weight
cv_scores = list()
min_child_weight_list = [i for i in range(1,10,2)]
for i in tqdm(range(1,10,2)):
xgbc = xgboost.XGBClassifier(learning_rate = 0.1,
n_estimators = 350,
max_depth = 5,
min_child_weight = i,
gamma = 0,
subsample = 0.8,
colsample_bytree = 0.8,
objective="binary:logistic",
nthread= -1,
scale_pos_weight = 1,
seed=220205,
eval_metric='error',
tree_method='gpu_hist',
predictor='gpu_predictor')
score = cross_val_score(xgbc, X_train_dataset1, y_train_dataset1, cv=5, scoring="accuracy").mean()
cv_scores.append(score)
best_e = [min_child_weight_list[i] for i in range(len(cv_scores)) if cv_scores[i] == np.max(cv_scores)]
100%|██████████| 5/5 [01:20<00:00, 16.01s/it]
plt.figure(figsize=(20,10))
plt.plot(min_child_weight_list, cv_scores, marker='o', linestyle='dashed')
plt.axvline(best_e[0], color='r', linestyle= '--', linewidth=2)
print(f"optimizer tree count : {(cv_scores.index(max(cv_scores)))} ")
# 1
print(f"Train set에 대한 성능 : {max(cv_scores):.4f} ")
optimizer tree count : 0 Train set에 대한 성능 : 0.7541
XGBoost Model Training
The values obtained above are passed in as the models' training parameters.
tree_method='gpu_hist',
predictor='gpu_predictor'
These two settings make training run on the GPU; if you don't have one, delete them and proceed.
Training follows the scheme described in the Train Valid Split section.
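A small sketch of toggling those two settings instead of deleting them by hand (use_gpu is an assumed flag, not part of the original notebook):
use_gpu = True  # set to False on a CPU-only machine
gpu_params = dict(tree_method='gpu_hist', predictor='gpu_predictor') if use_gpu else {}
# then pass **gpu_params to xgboost.XGBClassifier(...) alongside the other parameters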
def make_xgb(**overrides):
    # All five models share these hyperparameters; per-model differences
    # (model1's tuned values) are passed in as overrides.
    params = dict(learning_rate=0.01,
                  n_estimators=1000,
                  max_depth=5,
                  min_child_weight=10,
                  gamma=0,
                  subsample=0.8,
                  colsample_bytree=0.8,
                  objective="binary:logistic",
                  nthread=-1,
                  scale_pos_weight=1,
                  seed=220205,
                  tree_method='gpu_hist',
                  predictor='gpu_predictor')
    params.update(overrides)
    return xgboost.XGBClassifier(**params)

model1 = make_xgb(n_estimators=350, max_depth=9, min_child_weight=1)  # tuned values from above
model2 = make_xgb()
model3 = make_xgb()
model4 = make_xgb()
model5 = make_xgb()
model1.fit(X_train_dataset1, y_train_dataset1)
model2.fit(X_train_dataset2, y_train_dataset2)
model3.fit(X_train_dataset3, y_train_dataset3)
model4.fit(X_train_dataset4, y_train_dataset4)
model5.fit(X_train_dataset5, y_train_dataset5)
XGBClassifier(colsample_bytree=0.8, learning_rate=0.01, max_depth=5, min_child_weight=10, n_estimators=1000, nthread=-1, predictor='gpu_predictor', seed=220205, subsample=0.8, tree_method='gpu_hist')
# Each model predicts on the one fold it never saw during training
y_pred1 = model1.predict(X_train5)
y_pred2 = model2.predict(X_train4)
y_pred3 = model3.predict(X_train3)
y_pred4 = model4.predict(X_train2)
y_pred5 = model5.predict(X_train1)
# Predicted probabilities for classes 0 and 1 on each held-out fold
y_prob1 = model1.predict_proba(X_train5)
y_prob2 = model2.predict_proba(X_train4)
y_prob3 = model3.predict_proba(X_train3)
y_prob4 = model4.predict_proba(X_train2)
y_prob5 = model5.predict_proba(X_train1)
# Count and ratio of validation predictions equal to 1
print("thr > 0.5")
print(f" model1의 예측비율 : {y_pred1.sum() / len(y_pred1)}")
print(f" model2의 예측비율 : {y_pred2.sum() / len(y_pred2)}")
print(f" model3의 예측비율 : {y_pred3.sum() / len(y_pred3)}")
print(f" model4의 예측비율 : {y_pred4.sum() / len(y_pred4)}")
print(f" model5의 예측비율 : {y_pred5.sum() / len(y_pred5)}")
# Now with a lower threshold than the 0.5 default
thr = 0.38
print(f"thr > {thr}")
pred1 = (y_prob1[:,1] >= thr).astype(np.int64)
pred2 = (y_prob2[:,1] >= thr).astype(np.int64)
pred3 = (y_prob3[:,1] >= thr).astype(np.int64)
pred4 = (y_prob4[:,1] >= thr).astype(np.int64)
pred5 = (y_prob5[:,1] >= thr).astype(np.int64)
print(f" model1의 예측비율 : {pred1.sum() / len(pred1)}")
print(f" model1의 예측비율 : {pred2.sum() / len(pred2)}")
print(f" model1의 예측비율 : {pred3.sum() / len(pred3)}")
print(f" model1의 예측비율 : {pred4.sum() / len(pred4)}")
print(f" model1의 예측비율 : {pred5.sum() / len(pred5)}")
thr > 0.5
 model1 predicted-1 ratio : 0.2039
 model2 predicted-1 ratio : 0.2112
 model3 predicted-1 ratio : 0.2054
 model4 predicted-1 ratio : 0.21025
 model5 predicted-1 ratio : 0.2036
thr > 0.38
 model1 predicted-1 ratio : 0.3277
 model2 predicted-1 ratio : 0.3358
 model3 predicted-1 ratio : 0.32705
 model4 predicted-1 ratio : 0.33205
 model5 predicted-1 ratio : 0.32595
Looking at this: predict() uses a default threshold of 0.5, so the y=1 ratio ends up far from the train_dataset's ratio.
That's why we need to compute predict_proba() and adjust the threshold ourselves.
To be honest, I wondered whether changing the threshold like this spoils what the model worked so hard to learn,
and also: is it even possible to adjust the threshold while training the model?
That's something I still need to study.
I've been digging through the docs, but I couldn't work it out with my current ability.
Maybe it has to be implemented by hand…
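For what it's worth, the usual approach is exactly what this notebook already does: train on a proper loss, then choose the threshold afterwards on held-out data. A minimal sketch of automating that choice (my own illustration; best_threshold is a hypothetical helper, not part of the notebook; np and f1_score are already imported at the top):
def best_threshold(y_true, y_prob, grid=np.arange(0.30, 0.55, 0.01)):
    # scan candidate thresholds on a validation fold and keep the one
    # that maximizes macro f1, the competition's metric
    scores = [f1_score(y_true, (y_prob >= t).astype(int), average='macro') for t in grid]
    return grid[int(np.argmax(scores))]
# e.g. thr = best_threshold(y_train5, y_prob1[:, 1])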
# Define the evaluation function
def get_clf_eval(y_actual, y_pred):
    accuracy = accuracy_score(y_actual, y_pred)
    precision = precision_score(y_actual, y_pred)
    recall = recall_score(y_actual, y_pred)
    # Note: passing hard 0/1 predictions to roc_auc_score gives a single-threshold
    # "AUC"; pass predict_proba scores instead for a true ROC AUC.
    AUC = roc_auc_score(y_actual, y_pred)
    F1 = f1_score(y_actual, y_pred)
    print('\naccuracy: {:.4f}'.format(accuracy))
    print('precision: {:.4f}'.format(precision))
    print('recall: {:.4f}'.format(recall))
    print('AUC: {:.4f}'.format(AUC))
    print('F1: {:.4f}'.format(F1))
    # sns.heatmap(confusion_matrix(y_actual, y_pred), annot=True, fmt='d', cmap='YlGnBu')
# Check xgboost performance on each held-out fold
get_clf_eval(y_train5, pred1)
get_clf_eval(y_train4, pred2)
get_clf_eval(y_train3, pred3)
get_clf_eval(y_train2, pred4)
get_clf_eval(y_train1, pred5)
accuracy: 0.7500
precision: 0.6038
recall: 0.6221
AUC: 0.7158
F1: 0.6128

accuracy: 0.7428
precision: 0.6127
recall: 0.6181
AUC: 0.7116
F1: 0.6154

accuracy: 0.7459
precision: 0.6149
recall: 0.6107
AUC: 0.7115
F1: 0.6128

accuracy: 0.7456
precision: 0.6026
recall: 0.6203
AUC: 0.7127
F1: 0.6113

accuracy: 0.7435
precision: 0.6059
recall: 0.6066
AUC: 0.7081
F1: 0.6062
Not a satisfying result…
With MNIST and the other datasets sklearn provides, training always got me past 90,
but here I couldn't even break 80 despite evaluating within the train_dataset…
I dread how much further it will drop on test…
Randomforest hyperparameter tuning
Randomforest Model Training
Submission
# Load the submission template
submit = pd.read_csv('/content/drive/MyDrive/22-01-28/sample_submission.csv')
# Run predictions on the test set
df_test = pd.read_csv('/content/drive/MyDrive/22-01-28/test.csv')
test = df_test.drop('ID', axis=1)
test
(Output: test, 35816 rows × 75 columns; the same feature columns as the training data, without depvar.)
# out1 = model1.predict(test)
# out2 = model2.predict(test)
# out3 = model3.predict(test)
# out4 = model4.predict(test)
# out5 = model5.predict(test)
prob1 = model1.predict_proba(test)
out1 = (prob1[:,1] >= thr).astype(np.int64)
prob2 = model2.predict_proba(test)
out2 = (prob2[:,1] >= thr).astype(np.int64)
prob3 = model3.predict_proba(test)
out3 = (prob3[:,1] >= thr).astype(np.int64)
prob4 = model4.predict_proba(test)
out4 = (prob4[:,1] >= thr).astype(np.int64)
prob5 = model5.predict_proba(test)
out5 = (prob5[:,1] >= thr).astype(np.int64)
print(out1.sum() / len(out1))
print(out2.sum() / len(out2))
print(out3.sum() / len(out3))
print(out4.sum() / len(out4))
print(out5.sum() / len(out5))
0.3358554835827563
0.334682823319187
0.332449184721912
0.3370002233638597
0.3345432209068573
result = []
for i in range(len(test)):
    # Majority vote: predict 1 when at least 3 of the 5 models say 1
    if (out1[i] + out2[i] + out3[i] + out4[i] + out5[i]) >= 3:
        result.append(1)
    else:
        result.append(0)
print(f" 최종 결과물!!! ")
print(f" {sum(result) / len(result)} ")
print(len(result))
print(result.count(1))
submit["answer"] = result
 Final result!!!
 0.3345153004243913
35816
11981
# Save the submission file
submit.to_csv('/content/drive/MyDrive/22-02-05/2022-02-05-14.csv', index=False)
I revised and trained the models with the y=1 ratio as priority 1
and the f1 score as priority 2.
On the competition deadline I plan to keep following those same two principles for the final submission. (Until I study enough to figure something out, that will probably stay the same.)
The comments below record the scores and hyperparameters of the files I submitted.
To wrap up…
There is so much left to study!!!!
##########################################
# CSV file descriptions
# 2022-02-05-1 : no hyperparameter tuning; plain predict (threshold 0.5).
# 2022-02-05-2 : no hyperparameter tuning; predict with threshold 0.42.
# 2022-02-05-3 : hyperparameter-tuned; predict with threshold 0.42. (How did it get worse after tuning?)
# xgboost.XGBClassifier(learning_rate = 0.01,
# n_estimators = 350,
# max_depth = 9,
# min_child_weight = 1,
# gamma = 0,
# subsample = 0.8,
# colsample_bytree = 0.8,
# objective="binary:logistic",
# nthread= -1,
# scale_pos_weight = 1,
# seed=220205,
# tree_method='gpu_hist',
# predictor='gpu_predictor')
# 2022-02-05-4 : hyperparameter-tuned; predict with threshold 0.38. (best score: 0.717169)
# 2022-02-05-5 : hyperparameter-tuned; predict with threshold 0.35. At 0.35 the ratio of label 1 broke down (0.39), but the f1 score was high! 0.637~0.638
# 2022-02-05-6 : hyperparameter-tuned; predict with threshold 0.35.
#                score: 713982
# xgboost.XGBClassifier(learning_rate = 0.01,
# n_estimators = 1000,
# max_depth = 5,
# min_child_weight = 1,
# gamma = 0,
# subsample = 0.8,
# colsample_bytree = 0.8,
# objective="binary:logistic",
# nthread= -1,
# scale_pos_weight = 1,
# seed=220205,
# eval_metric='error',
# tree_method='gpu_hist',
# predictor='gpu_predictor')
# 2022-02-05-7 : hyperparameter-tuned; predict with threshold 0.38.
#                score: 713330
# xgboost.XGBClassifier(learning_rate = 0.01,
# n_estimators = 1000,
# max_depth = 5,
# min_child_weight = 1,
# gamma = 0,
# subsample = 0.8,
# colsample_bytree = 0.8,
# objective="binary:logistic",
# nthread= -1,
# scale_pos_weight = 1,
# seed=220205,
# eval_metric='error',
# tree_method='gpu_hist',
# predictor='gpu_predictor')
# 2022-02-05-8 : hyperparameter-tuned; predict with threshold 0.4.
#                score: 713574
# xgboost.XGBClassifier(learning_rate = 0.01,
# n_estimators = 350,
# max_depth = 9,
# min_child_weight = 1,
# gamma = 0,
# subsample = 0.8,
# colsample_bytree = 0.8,
# objective="binary:logistic",
# nthread= -1,
# scale_pos_weight = 1,
# seed=220205,
# eval_metric='error',
# tree_method='gpu_hist',
# predictor='gpu_predictor')
# 2022-02-05-9 : hyperparameter-tuned; predict with threshold 0.405.
#                score: 712255
# xgboost.XGBClassifier(learning_rate = 0.01,
# n_estimators = 350,
# max_depth = 9,
# min_child_weight = 1,
# gamma = 0,
# subsample = 0.8,
# colsample_bytree = 0.8,
# objective="binary:logistic",
# nthread= -1,
# scale_pos_weight = 1,
# seed=220205,
# tree_method='gpu_hist',
# predictor='gpu_predictor')
# 2022-02-05-10 : hyperparameter-tuned; predict with threshold 0.4.
#                 score: 713574
# xgboost.XGBClassifier(learning_rate = 0.01,
# n_estimators = 350,
# max_depth = 9,
# min_child_weight = 1,
# gamma = 0,
# subsample = 0.8,
# colsample_bytree = 0.8,
# objective="binary:logistic",
# nthread= -1,
# scale_pos_weight = 1,
# seed=220205,
# tree_method='gpu_hist',
# predictor='gpu_predictor')
# 2022-02-05-11 : hyperparameter-tuned; predict with threshold 0.38.
#                 score: 716175
# xgboost.XGBClassifier(learning_rate = 0.01,
# n_estimators = 350,
# max_depth = 9,
# min_child_weight = 1,
# gamma = 0,
# subsample = 0.8,
# colsample_bytree = 0.8,
# objective="binary:logistic",
# nthread= -1,
# scale_pos_weight = 1,
# seed=220205,
# tree_method='gpu_hist',
# predictor='gpu_predictor')
# 2022-02-05-12 : hyperparameter-tuned; predict with threshold 0.36.
#                 score: 716011
# xgboost.XGBClassifier(learning_rate = 0.01,
# n_estimators = 350,
# max_depth = 9,
# min_child_weight = 1,
# gamma = 0,
# subsample = 0.8,
# colsample_bytree = 0.8,
# objective="binary:logistic",
# nthread= -1,
# scale_pos_weight = 1,
# seed=220205,
# tree_method='gpu_hist',
# predictor='gpu_predictor')
# 2022-02-05-13 : hyperparameter-tuned; predict with thresholds 0.38-0.4.
#                 score: 713264
#                 (only model1 was configured differently for this one)
# 2022-02-05-14 : hyperparameter-tuned; predict with threshold 0.38.
#                 score: 713171
# xgboost.XGBClassifier(learning_rate = 0.01,
# n_estimators = 1000,
# max_depth = 5,
# min_child_weight = 10,
# gamma = 0,
# subsample = 0.8,
# colsample_bytree = 0.8,
# objective="binary:logistic",
# nthread= -1,
# scale_pos_weight = 1,
# seed=220205,
# tree_method='gpu_hist',
# predictor='gpu_predictor')