In this notebook we will explore the three methods and compare their results with a multiple linear regression model applied to Boston Housing dataset. The target variable is price and the features are 10 polynomial features of LSTAT: % lower status of the population. LSTAT2= LSTAT2 , LSTAT3= LSTAT3 , and etc.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
sns . set () #if you want to use seaborn themes with matplotlib functions
import warnings
warnings . filterwarnings ( 'ignore' )
/usr/local/lib/python3.7/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
import pandas.util.testing as tm
from google.colab import drive
drive . mount ( '/content/drive' )
# 드라이브 마운트 csv 파일 경로 지정( 코랩에서 돌렸다. )
Mounted at /content/drive
df = pd . read_csv ( "/content/drive/MyDrive/PyTorch(YearDream)/2022-01-10_DAY15/Regularization_Boston.csv" )
price
LSTAT
LSTAT2
LSTAT3
LSTAT4
LSTAT5
LSTAT6
LSTAT7
LSTAT8
LSTAT9
LSTAT10
0
24.0
4.98
24.8004
123.505992
615.059840
3062.998004
15253.730060
7.596358e+04
3.782986e+05
1.883927e+06
9.381957e+06
1
21.6
9.14
83.5396
763.551944
6978.864768
63786.823980
583011.571200
5.328726e+06
4.870455e+07
4.451596e+08
4.068759e+09
2
34.7
4.03
16.2409
65.450827
263.766833
1062.980336
4283.810755
1.726376e+04
6.957294e+04
2.803790e+05
1.129927e+06
3
33.4
2.94
8.6436
25.412184
74.711821
219.652754
645.779096
1.898591e+03
5.581856e+03
1.641066e+04
4.824733e+04
4
36.2
5.33
28.4089
151.419437
807.065599
4301.659644
22927.845900
1.222054e+05
6.513549e+05
3.471722e+06
1.850428e+07
<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
width="24px">
</svg>
sns . scatterplot ( x = 'LSTAT' , y = 'price' , data = df )
plt . show ()
normalize the features!
from sklearn.preprocessing import StandardScaler
# Your code
scaler = StandardScaler ()
df_sc = scaler . fit_transform ( df )
df_sc [: 5 ]
array([[ 0.15968566, -1.0755623 , -0.78952949, -0.56845926, -0.42533928,
-0.33434062, -0.2740581 , -0.23195975, -0.20115811, -0.1777807 ,
-0.15953586],
[-0.10152429, -0.49243937, -0.54045362, -0.48104568, -0.39814056,
-0.32648529, -0.27189596, -0.23138362, -0.20100802, -0.17774223,
-0.15952611],
[ 1.32424667, -1.2087274 , -0.82582493, -0.57638808, -0.4268407 ,
-0.33459934, -0.27409987, -0.23196619, -0.20115907, -0.17778084,
-0.15953588],
[ 1.18275795, -1.36151682, -0.85804028, -0.58185631, -0.42764871,
-0.33470844, -0.27411373, -0.23196788, -0.20115927, -0.17778086,
-0.15953588],
[ 1.48750288, -1.02650148, -0.77422812, -0.56464701, -0.42451865,
-0.33418038, -0.27402887, -0.23195468, -0.20115726, -0.17778056,
-0.15953584]])
price
LSTAT
LSTAT2
LSTAT3
LSTAT4
LSTAT5
LSTAT6
LSTAT7
LSTAT8
LSTAT9
LSTAT10
count
506.000000
506.000000
506.000000
506.000000
5.060000e+02
5.060000e+02
5.060000e+02
5.060000e+02
5.060000e+02
5.060000e+02
5.060000e+02
mean
22.532806
12.653063
210.993989
4285.788793
1.001336e+05
2.587609e+06
7.198029e+07
2.114923e+09
6.477077e+10
2.048399e+12
6.645292e+13
std
9.197104
7.141062
236.061920
7329.288372
2.342059e+05
7.737927e+06
2.628503e+08
9.126326e+09
3.223061e+11
1.153345e+13
4.169512e+14
min
5.000000
1.730000
2.992900
5.177717
8.957450e+00
1.549639e+01
2.680875e+01
4.637914e+01
8.023592e+01
1.388081e+02
2.401381e+02
25%
17.025000
6.950000
48.303700
335.727443
2.333481e+03
1.621932e+04
1.127384e+05
7.836504e+05
5.447333e+06
3.786664e+07
2.632333e+08
50%
21.200000
11.360000
129.050000
1466.017088
1.665411e+04
1.891930e+05
2.149266e+06
2.441612e+07
2.773731e+08
3.151037e+09
3.579677e+10
75%
25.000000
16.955000
287.472100
4874.091998
8.264029e+04
1.401168e+06
2.375683e+07
4.027977e+08
6.829447e+09
1.157935e+11
1.963285e+12
max
50.000000
37.970000
1441.720900
54742.142570
2.078559e+06
7.892289e+07
2.996702e+09
1.137850e+11
4.320410e+12
1.640460e+14
6.228820e+15
<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
width="24px">
</svg>
df_sc = pd . DataFrame ( df_sc , columns = df . columns ) # Numpy type을 pandas DataFrame type으로 바꿔준다.
df_sc . head ()
# type(df_sc)
# df와 df_sc는 현재 동일한 데이터타입.
price
LSTAT
LSTAT2
LSTAT3
LSTAT4
LSTAT5
LSTAT6
LSTAT7
LSTAT8
LSTAT9
LSTAT10
0
0.159686
-1.075562
-0.789529
-0.568459
-0.425339
-0.334341
-0.274058
-0.231960
-0.201158
-0.177781
-0.159536
1
-0.101524
-0.492439
-0.540454
-0.481046
-0.398141
-0.326485
-0.271896
-0.231384
-0.201008
-0.177742
-0.159526
2
1.324247
-1.208727
-0.825825
-0.576388
-0.426841
-0.334599
-0.274100
-0.231966
-0.201159
-0.177781
-0.159536
3
1.182758
-1.361517
-0.858040
-0.581856
-0.427649
-0.334708
-0.274114
-0.231968
-0.201159
-0.177781
-0.159536
4
1.487503
-1.026501
-0.774228
-0.564647
-0.424519
-0.334180
-0.274029
-0.231955
-0.201157
-0.177781
-0.159536
<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
width="24px">
</svg>
sns . scatterplot ( x = 'LSTAT' , y = 'price' , data = df_sc )
plt . show ()
Splitting the data (train / test)
test_size=0.2, random_state=rand_state
from sklearn.model_selection import train_test_split
0 0.159686
1 -0.101524
2 1.324247
3 1.182758
4 1.487503
...
501 -0.014454
502 -0.210362
503 0.148802
504 -0.057989
505 -1.157248
Name: price, Length: 506, dtype: float64
X = df_sc . drop ( 'price' , axis = 1 )
X
LSTAT
LSTAT2
LSTAT3
LSTAT4
LSTAT5
LSTAT6
LSTAT7
LSTAT8
LSTAT9
LSTAT10
0
-1.075562
-0.789529
-0.568459
-0.425339
-0.334341
-0.274058
-0.231960
-0.201158
-0.177781
-0.159536
1
-0.492439
-0.540454
-0.481046
-0.398141
-0.326485
-0.271896
-0.231384
-0.201008
-0.177742
-0.159526
2
-1.208727
-0.825825
-0.576388
-0.426841
-0.334599
-0.274100
-0.231966
-0.201159
-0.177781
-0.159536
3
-1.361517
-0.858040
-0.581856
-0.427649
-0.334708
-0.274114
-0.231968
-0.201159
-0.177781
-0.159536
4
-1.026501
-0.774228
-0.564647
-0.424519
-0.334180
-0.274029
-0.231955
-0.201157
-0.177781
-0.159536
...
...
...
...
...
...
...
...
...
...
...
501
-0.418147
-0.498180
-0.461833
-0.390597
-0.323799
-0.271002
-0.231101
-0.200922
-0.177717
-0.159519
502
-0.500850
-0.545089
-0.483086
-0.398916
-0.326753
-0.271982
-0.231410
-0.201016
-0.177744
-0.159527
503
-0.983048
-0.759808
-0.560825
-0.423643
-0.333999
-0.273994
-0.231948
-0.201156
-0.177780
-0.159536
504
-0.865302
-0.716638
-0.548165
-0.420432
-0.333259
-0.273834
-0.231915
-0.201150
-0.177779
-0.159536
505
-0.669058
-0.631389
-0.518501
-0.411489
-0.330806
-0.273204
-0.231761
-0.201113
-0.177771
-0.159534
506 rows × 10 columns
<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
width="24px">
</svg>
X_train , X_test , y_train , y_test = train_test_split ( X , y , test_size = 0.2 , random_state = rand_state )
(404, 10)
(102, 10)
LSTAT
LSTAT2
LSTAT3
LSTAT4
LSTAT5
LSTAT6
LSTAT7
LSTAT8
LSTAT9
LSTAT10
300
-0.922773
-0.738456
-0.554782
-0.422166
-0.333671
-0.273926
-0.231935
-0.201154
-0.177780
-0.159536
32
2.110588
2.361250
2.320551
2.091900
1.778692
1.449896
1.143940
0.878417
0.658205
0.481244
181
-0.448985
-0.516017
-0.470071
-0.393883
-0.324988
-0.271404
-0.231230
-0.200962
-0.177729
-0.159522
272
-0.690084
-0.641318
-0.522245
-0.412708
-0.331167
-0.273304
-0.231787
-0.201120
-0.177772
-0.159534
477
1.718101
1.736491
1.525676
1.217641
0.905982
0.635721
0.420786
0.259256
0.142724
0.061306
<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
width="24px">
</svg>
X_test_wc = sm . add_constant ( X_test )
X_train_wc = sm . add_constant ( X_train )
X_train_wc . head ()
const
LSTAT
LSTAT2
LSTAT3
LSTAT4
LSTAT5
LSTAT6
LSTAT7
LSTAT8
LSTAT9
LSTAT10
300
1.0
-0.922773
-0.738456
-0.554782
-0.422166
-0.333671
-0.273926
-0.231935
-0.201154
-0.177780
-0.159536
32
1.0
2.110588
2.361250
2.320551
2.091900
1.778692
1.449896
1.143940
0.878417
0.658205
0.481244
181
1.0
-0.448985
-0.516017
-0.470071
-0.393883
-0.324988
-0.271404
-0.231230
-0.200962
-0.177729
-0.159522
272
1.0
-0.690084
-0.641318
-0.522245
-0.412708
-0.331167
-0.273304
-0.231787
-0.201120
-0.177772
-0.159534
477
1.0
1.718101
1.736491
1.525676
1.217641
0.905982
0.635721
0.420786
0.259256
0.142724
0.061306
<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
width="24px">
</svg>
model = sm . OLS ( y_train , X_train_wc ). fit ()
model . summary ()
OLS Regression Results
Dep. Variable: price R-squared: 0.677
Model: OLS Adj. R-squared: 0.669
Method: Least Squares F-statistic: 82.44
Date: Mon, 10 Jan 2022 Prob (F-statistic): 4.51e-90
Time: 08:17:17 Log-Likelihood: -344.23
No. Observations: 404 AIC: 710.5
Df Residuals: 393 BIC: 754.5
Df Model: 10
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 0.0120 0.029 0.417 0.677 -0.044 0.068
LSTAT 10.4856 14.789 0.709 0.479 -18.591 39.562
LSTAT2 -198.4474 183.865 -1.079 0.281 -559.929 163.035
LSTAT3 1218.5360 1157.209 1.053 0.293 -1056.559 3493.631
LSTAT4 -4039.5201 4549.147 -0.888 0.375 -1.3e+04 4904.187
LSTAT5 8029.8874 1.19e+04 0.675 0.500 -1.54e+04 3.14e+04
LSTAT6 -9630.6577 2.11e+04 -0.456 0.649 -5.11e+04 3.19e+04
LSTAT7 6399.3109 2.51e+04 0.255 0.799 -4.3e+04 5.58e+04
LSTAT8 -1554.6344 1.92e+04 -0.081 0.935 -3.93e+04 3.62e+04
LSTAT9 -523.9209 8485.139 -0.062 0.951 -1.72e+04 1.62e+04
LSTAT10 288.1740 1645.665 0.175 0.861 -2947.235 3523.583
Omnibus: 106.759 Durbin-Watson: 1.957
Prob(Omnibus): 0.000 Jarque-Bera (JB): 319.634
Skew: 1.217 Prob(JB): 3.91e-70
Kurtosis: 6.614 Cond. No. 4.56e+06
Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 4.56e+06. This might indicate that there are strong multicollinearity or other numerical problems.
<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f34e1bb78d0>
Training the models
Linear regression (model_linear)
Ridge regression (model_ridge)
Lasso regression (model_lasso)
Elastic Net regression (model_net)
from sklearn.linear_model import LinearRegression , Ridge , RidgeCV , Lasso , LassoCV , ElasticNet , ElasticNetCV
# starting with default parameters:
model_linear = LinearRegression ()
model_ridge = Ridge ()
model_lasso = Lasso ()
model_net = ElasticNet ()
y_hat_linear = model_linear . fit ( X_train , y_train ). predict ( X_test )
y_hat_ridge = model_ridge . fit ( X_train , y_train ). predict ( X_test )
y_hat_lasso = model_lasso . fit ( X_train , y_train ). predict ( X_test )
y_hat_net = model_net . fit ( X_train , y_train ). predict ( X_test )
df_pred = pd . DataFrame ({ 'y_test' : y_test , 'y_hat_linear' : y_hat_linear , 'y_hat_ridge' : y_hat_ridge , 'y_hat_lasso' : y_hat_lasso , 'y_hat_net' : y_hat_net })
df_pred . head ()
y_test
y_hat_linear
y_hat_ridge
y_hat_lasso
y_hat_net
483
-0.079757
-0.019459
0.029108
0.009199
0.059533
426
-1.342272
-0.480570
-0.589703
0.009199
-0.054729
22
-0.798084
-0.736176
-0.786380
0.009199
-0.120425
268
2.282016
2.053967
1.495823
0.009199
0.216942
371
2.989460
0.041490
0.171371
0.009199
0.078830
<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
width="24px">
</svg>
df . drop ( 'price' , axis = 1 , inplace = False ). columns
Index(['LSTAT', 'LSTAT2', 'LSTAT3', 'LSTAT4', 'LSTAT5', 'LSTAT6', 'LSTAT7',
'LSTAT8', 'LSTAT9', 'LSTAT10'],
dtype='object')
coefficients = pd . DataFrame ({ 'Features' : df . drop ( 'price' , axis = 1 , inplace = False ). columns })
coefficients [ 'model_liner' ] = model_linear . coef_
coefficients [ 'model_ridge' ] = model_ridge . coef_
coefficients [ 'model_lasso' ] = model_lasso . coef_
coefficients [ 'model_elastic_net' ] = model_net . coef_
coefficients
Features
model_liner
model_ridge
model_lasso
model_elastic_net
0
LSTAT
10.485596
-1.997427
-0.0
-0.154677
1
LSTAT2
-198.447363
1.120503
-0.0
-0.000000
2
LSTAT3
1218.536028
0.705506
-0.0
-0.000000
3
LSTAT4
-4039.520112
-0.029523
-0.0
-0.000000
4
LSTAT5
8029.887428
-0.331986
-0.0
-0.000000
5
LSTAT6
-9630.657680
-0.305169
-0.0
-0.000000
6
LSTAT7
6399.310934
-0.150135
-0.0
-0.000000
7
LSTAT8
-1554.634399
0.007747
-0.0
-0.000000
8
LSTAT9
-523.920947
0.115016
-0.0
-0.000000
9
LSTAT10
288.174050
0.159595
-0.0
-0.000000
<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
width="24px">
</svg>
MSE_test = np . mean ( np . square ( df_pred [ 'y_test' ] - df_pred [ 'y_hat_linear' ]))
RMSE_test = np . sqrt ( MSE_test )
np . round ( RMSE_test , 3 )
0.541
MSE_test = np . mean ( np . square ( df_pred [ 'y_test' ] - df_pred [ 'y_hat_ridge' ]))
RMSE_test = np . sqrt ( MSE_test )
np . round ( RMSE_test , 3 )
0.559
MSE_test = np . mean ( np . square ( df_pred [ 'y_test' ] - df_pred [ 'y_hat_lasso' ]))
RMSE_test = np . sqrt ( MSE_test )
np . round ( RMSE_test , 3 )
1.006
MSE_test = np . mean ( np . square ( df_pred [ 'y_test' ] - df_pred [ 'y_hat_net' ]))
RMSE_test = np . sqrt ( MSE_test )
np . round ( RMSE_test , 3 )
0.898
Plotting the regression coefficients vs alphas:
1) Ridge
alpha_ridge = 10 ** np . linspace ( - 2 , 4 , 100 )
ridge = Ridge ()
coefs_ridge = []
for i in alpha_ridge :
ridge . set_params ( alpha = i )
ridge . fit ( X_train , y_train )
coefs_ridge . append ( ridge . coef_ )
np . shape ( coefs_ridge )
(100, 10)
plt . figure ( figsize = ( 12 , 10 ))
ax = plt . gca ()
ax . plot ( alpha_ridge , coefs_ridge )
ax . set_xscale ( 'log' )
plt . axis ( 'tight' )
plt . xlabel ( 'alpha' )
plt . ylabel ( 'weights: scaled coefficients' )
plt . title ( 'Ridge regression coefficients Vs. alpha' )
plt . legend ( df . drop ( 'price' , axis = 1 , inplace = False ). columns )
plt . show ()
2) Lasso
alpha_lasso = 10 ** np . linspace ( - 3 , 1 , 100 )
lasso = Lasso ()
coefs_lasso = []
for i in alpha_lasso :
lasso . set_params ( alpha = i )
lasso . fit ( X_train , y_train )
coefs_lasso . append ( lasso . coef_ )
np . shape ( coefs_lasso )
(100, 10)
plt . figure ( figsize = ( 12 , 10 ))
ax = plt . gca ()
ax . plot ( alpha_lasso , coefs_lasso )
ax . set_xscale ( 'log' )
plt . axis ( 'tight' )
plt . xlabel ( 'alpha' )
plt . ylabel ( 'weights: scaled coefficients' )
plt . title ( 'Lasso regression coefficients Vs. alpha' )
plt . legend ( df . drop ( 'price' , axis = 1 , inplace = False ). columns )
plt . show ()
3) ElasticNet
alpha_elasticnet = 10 ** np . linspace ( - 3 , 2 , 100 )
elasticnet = ElasticNet ()
coefs_elasticnet = []
for i in alpha_elasticnet :
elasticnet . set_params ( alpha = i )
elasticnet . fit ( X_train , y_train )
coefs_elasticnet . append ( elasticnet . coef_ )
np . shape ( coefs_elasticnet )
(100, 10)
plt . figure ( figsize = ( 12 , 10 ))
ax = plt . gca ()
ax . plot ( alpha_elasticnet , coefs_elasticnet )
ax . set_xscale ( 'log' )
plt . axis ( 'tight' )
plt . xlabel ( 'alpha' )
plt . ylabel ( 'weights: scaled coefficients' )
plt . title ( 'Elastic Net regression coefficients Vs. alpha' )
plt . legend ( df . drop ( 'price' , axis = 1 , inplace = False ). columns )
plt . show ()
Cross Validation
1) Ridge
ridgecv = RidgeCV ()
ridgecv . fit ( X_train , y_train )
ridgecv . alpha_
0.1
alpha_ridge_opt = ridgecv . alpha_
0.1
2) Lasso
lassocv = LassoCV ()
lassocv . fit ( X_train , y_train )
lassocv . alpha_
0.0007404280761639708
alpha_lasso_opt = lassocv . alpha_
RMSE_CV = []
iterator = np . arange ( 0.0 , 0.02 , 0.001 )
for i in iterator :
MSE = - cross_val_score ( estimator = Lasso ( alpha = i ), X = X_train , y = y_train , cv = 5 , scoring = "neg_mean_squared_error" )
RMSE_CV . append ( np . sqrt ( MSE ). mean ())
output = pd . DataFrame ( list ( iterator ), columns = [ 'lambda_Lasso' ])
output [ 'RMSE_CV' ] = RMSE_CV
output . head ()
lambda_Lasso
RMSE_CV
0
0.000
0.591622
1
0.001
0.595970
2
0.002
0.600641
3
0.003
0.604491
4
0.004
0.608323
<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
width="24px">
</svg>
output [ 'RMSE_CV' ]. idxmin ()
0
output [ 'lambda_Lasso' ][ output [ 'RMSE_CV' ]. idxmin ()]
0.0
sns . lineplot ( x = 'lambda_Lasso' , y = 'RMSE_CV' , data = output , color = 'r' , label = "RMSE_CV vs lambda_ridge" )
plt . show ()
3) ElasticNet
elasticnetcv = ElasticNetCV ()
elasticnetcv . fit ( X_train , y_train )
elasticnetcv . alpha_
0.0014808561523279417
0.5
from sklearn.model_selection import cross_val_score
import sklearn.metrics
RMSE_CV = []
iterator = np . arange ( 0.0 , 0.02 , 0.001 )
for i in iterator :
MSE = - cross_val_score ( estimator = ElasticNet ( alpha = i ), X = X_train , y = y_train , cv = 5 , scoring = "neg_mean_squared_error" )
RMSE_CV . append ( np . sqrt ( MSE ). mean ())
output = pd . DataFrame ( list ( iterator ), columns = [ 'lambda_ElasticNet' ])
output [ 'RMSE_CV' ] = RMSE_CV
output . head ( 25 )
lambda_ElasticNet
RMSE_CV
0
0.000
0.591622
1
0.001
0.597029
2
0.002
0.602498
3
0.003
0.606075
4
0.004
0.609711
5
0.005
0.612586
6
0.006
0.614557
7
0.007
0.616387
8
0.008
0.617937
9
0.009
0.619380
10
0.010
0.620785
11
0.011
0.622211
12
0.012
0.623522
13
0.013
0.624668
14
0.014
0.625692
15
0.015
0.626773
16
0.016
0.627784
17
0.017
0.628810
18
0.018
0.629684
19
0.019
0.630521
<svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
width="24px">
</svg>
output [ 'RMSE_CV' ]. idxmin ()
0
output [ 'lambda_ElasticNet' ][ output [ 'RMSE_CV' ]. idxmin ()]
0.0
sns . lineplot ( x = 'lambda_ElasticNet' , y = 'RMSE_CV' , data = output , color = 'r' , label = "RMSE_CV vs lambda_ridge" )
plt . show ()
댓글남기기