In this notebook we will explore three regularization methods (Ridge, Lasso, and Elastic Net) and compare their results with a multiple linear regression model on the Boston Housing dataset. The target variable is price, and the features are 10 polynomial powers of LSTAT (% lower status of the population): LSTAT2 = LSTAT², LSTAT3 = LSTAT³, and so on up to LSTAT10.
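
If you ever need to rebuild such features from a raw LSTAT column, here is a minimal sketch (a hypothetical helper; the CSV used below already ships with the powers precomputed):

import pandas as pd

def add_lstat_powers(df: pd.DataFrame, max_degree: int = 10) -> pd.DataFrame:
    # hypothetical helper: add LSTAT^2 ... LSTAT^max_degree as new columns
    for d in range(2, max_degree + 1):
        df[f'LSTAT{d}'] = df['LSTAT'] ** d
    return df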

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
sns.set()  #if you want to use seaborn themes with matplotlib functions
import warnings
warnings.filterwarnings('ignore')
from google.colab import drive
drive.mount('/content/drive')

# Mount Google Drive and set the path to the CSV file (this notebook was run in Colab)
Mounted at /content/drive
rand_state= 1000
df = pd.read_csv("/content/drive/MyDrive/PyTorch(YearDream)/2022-01-10_DAY15/Regularization_Boston.csv")
df.head()
price LSTAT LSTAT2 LSTAT3 LSTAT4 LSTAT5 LSTAT6 LSTAT7 LSTAT8 LSTAT9 LSTAT10
0 24.0 4.98 24.8004 123.505992 615.059840 3062.998004 15253.730060 7.596358e+04 3.782986e+05 1.883927e+06 9.381957e+06
1 21.6 9.14 83.5396 763.551944 6978.864768 63786.823980 583011.571200 5.328726e+06 4.870455e+07 4.451596e+08 4.068759e+09
2 34.7 4.03 16.2409 65.450827 263.766833 1062.980336 4283.810755 1.726376e+04 6.957294e+04 2.803790e+05 1.129927e+06
3 33.4 2.94 8.6436 25.412184 74.711821 219.652754 645.779096 1.898591e+03 5.581856e+03 1.641066e+04 4.824733e+04
4 36.2 5.33 28.4089 151.419437 807.065599 4301.659644 22927.845900 1.222054e+05 6.513549e+05 3.471722e+06 1.850428e+07
sns.scatterplot(x='LSTAT', y='price', data=df)
plt.show()

Standardize the features!

from sklearn.preprocessing import StandardScaler
# Standardize every column (including the target) to zero mean and unit variance
scaler = StandardScaler()
df_sc = scaler.fit_transform(df)   # returns a NumPy array
df_sc[:5]
array([[ 0.15968566, -1.0755623 , -0.78952949, -0.56845926, -0.42533928,
        -0.33434062, -0.2740581 , -0.23195975, -0.20115811, -0.1777807 ,
        -0.15953586],
       [-0.10152429, -0.49243937, -0.54045362, -0.48104568, -0.39814056,
        -0.32648529, -0.27189596, -0.23138362, -0.20100802, -0.17774223,
        -0.15952611],
       [ 1.32424667, -1.2087274 , -0.82582493, -0.57638808, -0.4268407 ,
        -0.33459934, -0.27409987, -0.23196619, -0.20115907, -0.17778084,
        -0.15953588],
       [ 1.18275795, -1.36151682, -0.85804028, -0.58185631, -0.42764871,
        -0.33470844, -0.27411373, -0.23196788, -0.20115927, -0.17778086,
        -0.15953588],
       [ 1.48750288, -1.02650148, -0.77422812, -0.56464701, -0.42451865,
        -0.33418038, -0.27402887, -0.23195468, -0.20115726, -0.17778056,
        -0.15953584]])
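
Note that fit_transform was applied to the whole frame, so the target price is standardized along with the features. The fitted scaler keeps the per-column parameters, so predictions can later be mapped back to the original price scale; a minimal sketch (price is column 0 of df):

# scaling parameters of the price column (roughly 22.5 and 9.19, see df.describe() below)
price_mean, price_std = scaler.mean_[0], scaler.scale_[0]

def to_price_units(y_std):
    # map standardized predictions back to the original price units
    return y_std * price_std + price_mean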
df.describe()
price LSTAT LSTAT2 LSTAT3 LSTAT4 LSTAT5 LSTAT6 LSTAT7 LSTAT8 LSTAT9 LSTAT10
count 506.000000 506.000000 506.000000 506.000000 5.060000e+02 5.060000e+02 5.060000e+02 5.060000e+02 5.060000e+02 5.060000e+02 5.060000e+02
mean 22.532806 12.653063 210.993989 4285.788793 1.001336e+05 2.587609e+06 7.198029e+07 2.114923e+09 6.477077e+10 2.048399e+12 6.645292e+13
std 9.197104 7.141062 236.061920 7329.288372 2.342059e+05 7.737927e+06 2.628503e+08 9.126326e+09 3.223061e+11 1.153345e+13 4.169512e+14
min 5.000000 1.730000 2.992900 5.177717 8.957450e+00 1.549639e+01 2.680875e+01 4.637914e+01 8.023592e+01 1.388081e+02 2.401381e+02
25% 17.025000 6.950000 48.303700 335.727443 2.333481e+03 1.621932e+04 1.127384e+05 7.836504e+05 5.447333e+06 3.786664e+07 2.632333e+08
50% 21.200000 11.360000 129.050000 1466.017088 1.665411e+04 1.891930e+05 2.149266e+06 2.441612e+07 2.773731e+08 3.151037e+09 3.579677e+10
75% 25.000000 16.955000 287.472100 4874.091998 8.264029e+04 1.401168e+06 2.375683e+07 4.027977e+08 6.829447e+09 1.157935e+11 1.963285e+12
max 50.000000 37.970000 1441.720900 54742.142570 2.078559e+06 7.892289e+07 2.996702e+09 1.137850e+11 4.320410e+12 1.640460e+14 6.228820e+15
df_sc = pd.DataFrame(df_sc, columns=df.columns)     # convert the NumPy array back to a pandas DataFrame
df_sc.head()
# type(df_sc)
# df and df_sc now share the same data type (DataFrame).
price LSTAT LSTAT2 LSTAT3 LSTAT4 LSTAT5 LSTAT6 LSTAT7 LSTAT8 LSTAT9 LSTAT10
0 0.159686 -1.075562 -0.789529 -0.568459 -0.425339 -0.334341 -0.274058 -0.231960 -0.201158 -0.177781 -0.159536
1 -0.101524 -0.492439 -0.540454 -0.481046 -0.398141 -0.326485 -0.271896 -0.231384 -0.201008 -0.177742 -0.159526
2 1.324247 -1.208727 -0.825825 -0.576388 -0.426841 -0.334599 -0.274100 -0.231966 -0.201159 -0.177781 -0.159536
3 1.182758 -1.361517 -0.858040 -0.581856 -0.427649 -0.334708 -0.274114 -0.231968 -0.201159 -0.177781 -0.159536
4 1.487503 -1.026501 -0.774228 -0.564647 -0.424519 -0.334180 -0.274029 -0.231955 -0.201157 -0.177781 -0.159536
sns.scatterplot(x='LSTAT', y='price', data=df_sc)
plt.show()
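
A quick sanity check that the standardization worked: every column of df_sc should now have mean ≈ 0 and standard deviation ≈ 1.

# StandardScaler divides by the population standard deviation (ddof=0)
print(df_sc.mean().round(6))
print(df_sc.std(ddof=0).round(6))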





Splitting the data (train / test)

test_size=0.2, random_state=rand_state

from sklearn.model_selection import train_test_split
y=df_sc['price']
y
0      0.159686
1     -0.101524
2      1.324247
3      1.182758
4      1.487503
         ...   
501   -0.014454
502   -0.210362
503    0.148802
504   -0.057989
505   -1.157248
Name: price, Length: 506, dtype: float64
X = df_sc.drop('price', axis=1)
X
LSTAT LSTAT2 LSTAT3 LSTAT4 LSTAT5 LSTAT6 LSTAT7 LSTAT8 LSTAT9 LSTAT10
0 -1.075562 -0.789529 -0.568459 -0.425339 -0.334341 -0.274058 -0.231960 -0.201158 -0.177781 -0.159536
1 -0.492439 -0.540454 -0.481046 -0.398141 -0.326485 -0.271896 -0.231384 -0.201008 -0.177742 -0.159526
2 -1.208727 -0.825825 -0.576388 -0.426841 -0.334599 -0.274100 -0.231966 -0.201159 -0.177781 -0.159536
3 -1.361517 -0.858040 -0.581856 -0.427649 -0.334708 -0.274114 -0.231968 -0.201159 -0.177781 -0.159536
4 -1.026501 -0.774228 -0.564647 -0.424519 -0.334180 -0.274029 -0.231955 -0.201157 -0.177781 -0.159536
... ... ... ... ... ... ... ... ... ... ...
501 -0.418147 -0.498180 -0.461833 -0.390597 -0.323799 -0.271002 -0.231101 -0.200922 -0.177717 -0.159519
502 -0.500850 -0.545089 -0.483086 -0.398916 -0.326753 -0.271982 -0.231410 -0.201016 -0.177744 -0.159527
503 -0.983048 -0.759808 -0.560825 -0.423643 -0.333999 -0.273994 -0.231948 -0.201156 -0.177780 -0.159536
504 -0.865302 -0.716638 -0.548165 -0.420432 -0.333259 -0.273834 -0.231915 -0.201150 -0.177779 -0.159536
505 -0.669058 -0.631389 -0.518501 -0.411489 -0.330806 -0.273204 -0.231761 -0.201113 -0.177771 -0.159534

506 rows × 10 columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rand_state)
X_train.shape
(404, 10)
X_test.shape
(102, 10)
X_train.head()
LSTAT LSTAT2 LSTAT3 LSTAT4 LSTAT5 LSTAT6 LSTAT7 LSTAT8 LSTAT9 LSTAT10
300 -0.922773 -0.738456 -0.554782 -0.422166 -0.333671 -0.273926 -0.231935 -0.201154 -0.177780 -0.159536
32 2.110588 2.361250 2.320551 2.091900 1.778692 1.449896 1.143940 0.878417 0.658205 0.481244
181 -0.448985 -0.516017 -0.470071 -0.393883 -0.324988 -0.271404 -0.231230 -0.200962 -0.177729 -0.159522
272 -0.690084 -0.641318 -0.522245 -0.412708 -0.331167 -0.273304 -0.231787 -0.201120 -0.177772 -0.159534
477 1.718101 1.736491 1.525676 1.217641 0.905982 0.635721 0.420786 0.259256 0.142724 0.061306
X_test_wc = sm.add_constant(X_test)
X_train_wc = sm.add_constant(X_train)
X_train_wc.head()
const LSTAT LSTAT2 LSTAT3 LSTAT4 LSTAT5 LSTAT6 LSTAT7 LSTAT8 LSTAT9 LSTAT10
300 1.0 -0.922773 -0.738456 -0.554782 -0.422166 -0.333671 -0.273926 -0.231935 -0.201154 -0.177780 -0.159536
32 1.0 2.110588 2.361250 2.320551 2.091900 1.778692 1.449896 1.143940 0.878417 0.658205 0.481244
181 1.0 -0.448985 -0.516017 -0.470071 -0.393883 -0.324988 -0.271404 -0.231230 -0.200962 -0.177729 -0.159522
272 1.0 -0.690084 -0.641318 -0.522245 -0.412708 -0.331167 -0.273304 -0.231787 -0.201120 -0.177772 -0.159534
477 1.0 1.718101 1.736491 1.525676 1.217641 0.905982 0.635721 0.420786 0.259256 0.142724 0.061306
model = sm.OLS(y_train, X_train_wc).fit()
model.summary()
OLS Regression Results
Dep. Variable: price R-squared: 0.677
Model: OLS Adj. R-squared: 0.669
Method: Least Squares F-statistic: 82.44
Date: Mon, 10 Jan 2022 Prob (F-statistic): 4.51e-90
Time: 08:17:17 Log-Likelihood: -344.23
No. Observations: 404 AIC: 710.5
Df Residuals: 393 BIC: 754.5
Df Model: 10
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 0.0120 0.029 0.417 0.677 -0.044 0.068
LSTAT 10.4856 14.789 0.709 0.479 -18.591 39.562
LSTAT2 -198.4474 183.865 -1.079 0.281 -559.929 163.035
LSTAT3 1218.5360 1157.209 1.053 0.293 -1056.559 3493.631
LSTAT4 -4039.5201 4549.147 -0.888 0.375 -1.3e+04 4904.187
LSTAT5 8029.8874 1.19e+04 0.675 0.500 -1.54e+04 3.14e+04
LSTAT6 -9630.6577 2.11e+04 -0.456 0.649 -5.11e+04 3.19e+04
LSTAT7 6399.3109 2.51e+04 0.255 0.799 -4.3e+04 5.58e+04
LSTAT8 -1554.6344 1.92e+04 -0.081 0.935 -3.93e+04 3.62e+04
LSTAT9 -523.9209 8485.139 -0.062 0.951 -1.72e+04 1.62e+04
LSTAT10 288.1740 1645.665 0.175 0.861 -2947.235 3523.583
Omnibus: 106.759 Durbin-Watson: 1.957
Prob(Omnibus): 0.000 Jarque-Bera (JB): 319.634
Skew: 1.217 Prob(JB): 3.91e-70
Kurtosis: 6.614 Cond. No. 4.56e+06



Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.56e+06. This might indicate that there are
strong multicollinearity or other numerical problems.

model
<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x7f34e1bb78d0>
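
The large condition number (4.56e+06) flags severe multicollinearity among the LSTAT powers, which is also why the OLS coefficients above are huge and none of them is statistically significant. A sketch that quantifies this with variance inflation factors:

from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF per feature (the constant column at index 0 is skipped);
# values far above 10 indicate strong multicollinearity
vif = pd.Series([variance_inflation_factor(X_train_wc.values, i)
                 for i in range(1, X_train_wc.shape[1])],
                index=X_train_wc.columns[1:])
print(vif)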




Training the models

  1. Linear regression (model_linear)

  2. Ridge regression (model_ridge)

  3. Lasso regression (model_lasso)

  4. Elastic Net regression (model_net)
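
For reference, these are the objectives being minimized, in scikit-learn's parameterization (the cells below start from the library defaults: alpha=1.0 for all three penalized models and l1_ratio=0.5 for ElasticNet):

Ridge:      ||y - Xw||² + alpha·||w||²
Lasso:      (1/(2n))·||y - Xw||² + alpha·||w||₁
ElasticNet: (1/(2n))·||y - Xw||² + alpha·l1_ratio·||w||₁ + (alpha/2)·(1 - l1_ratio)·||w||²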

from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
# starting with default parameters:
model_linear = LinearRegression()
model_ridge = Ridge()
model_lasso = Lasso()
model_net = ElasticNet()
y_hat_linear = model_linear.fit(X_train, y_train).predict(X_test)
y_hat_ridge = model_ridge.fit(X_train, y_train).predict(X_test)
y_hat_lasso = model_lasso.fit(X_train, y_train).predict(X_test)
y_hat_net = model_net.fit(X_train, y_train).predict(X_test)
df_pred = pd.DataFrame({'y_test':y_test, 'y_hat_linear':y_hat_linear, 'y_hat_ridge':y_hat_ridge, 'y_hat_lasso':y_hat_lasso, 'y_hat_net':y_hat_net})
df_pred.head()

y_test y_hat_linear y_hat_ridge y_hat_lasso y_hat_net
483 -0.079757 -0.019459 0.029108 0.009199 0.059533
426 -1.342272 -0.480570 -0.589703 0.009199 -0.054729
22 -0.798084 -0.736176 -0.786380 0.009199 -0.120425
268 2.282016 2.053967 1.495823 0.009199 0.216942
371 2.989460 0.041490 0.171371 0.009199 0.078830
df.drop('price', axis=1, inplace=False).columns
Index(['LSTAT', 'LSTAT2', 'LSTAT3', 'LSTAT4', 'LSTAT5', 'LSTAT6', 'LSTAT7',
       'LSTAT8', 'LSTAT9', 'LSTAT10'],
      dtype='object')
coefficients = pd.DataFrame({'Features':df.drop('price', axis=1, inplace=False).columns})
coefficients['model_linear']=model_linear.coef_
coefficients['model_ridge']=model_ridge.coef_
coefficients['model_lasso']=model_lasso.coef_
coefficients['model_elastic_net']=model_net.coef_
coefficients
Features model_linear model_ridge model_lasso model_elastic_net
0 LSTAT 10.485596 -1.997427 -0.0 -0.154677
1 LSTAT2 -198.447363 1.120503 -0.0 -0.000000
2 LSTAT3 1218.536028 0.705506 -0.0 -0.000000
3 LSTAT4 -4039.520112 -0.029523 -0.0 -0.000000
4 LSTAT5 8029.887428 -0.331986 -0.0 -0.000000
5 LSTAT6 -9630.657680 -0.305169 -0.0 -0.000000
6 LSTAT7 6399.310934 -0.150135 -0.0 -0.000000
7 LSTAT8 -1554.634399 0.007747 -0.0 -0.000000
8 LSTAT9 -523.920947 0.115016 -0.0 -0.000000
9 LSTAT10 288.174050 0.159595 -0.0 -0.000000
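
With the default alpha=1.0 on standardized features, Lasso shrinks every coefficient to exactly zero, so its predictions collapse to the intercept; that is why y_hat_lasso is the constant 0.009199 in df_pred above. A quick check:

# with all coefficients at zero, every Lasso prediction equals the intercept
print(model_lasso.intercept_)            # ≈ 0.0092, matching y_hat_lasso
print(np.unique(y_hat_lasso.round(6)))   # a single repeated value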





Performance on the test set for the 4 models.

MSE_test = np.mean(np.square(df_pred['y_test'] - df_pred['y_hat_linear']))
RMSE_test = np.sqrt(MSE_test)
np.round(RMSE_test,3)
0.541
MSE_test = np.mean(np.square(df_pred['y_test'] - df_pred['y_hat_ridge']))
RMSE_test = np.sqrt(MSE_test)
np.round(RMSE_test,3)
0.559
MSE_test = np.mean(np.square(df_pred['y_test'] - df_pred['y_hat_lasso']))
RMSE_test = np.sqrt(MSE_test)
np.round(RMSE_test,3)
1.006
MSE_test = np.mean(np.square(df_pred['y_test'] - df_pred['y_hat_net']))
RMSE_test = np.sqrt(MSE_test)
np.round(RMSE_test,3)
0.898
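
The same four numbers can be produced in one compact loop (equivalent to the cells above):

for col in ['y_hat_linear', 'y_hat_ridge', 'y_hat_lasso', 'y_hat_net']:
    rmse = np.sqrt(np.mean(np.square(df_pred['y_test'] - df_pred[col])))
    print(col, np.round(rmse, 3))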



Plotting the regression coefficients vs alphas:

1) Ridge

alpha_ridge = 10**np.linspace(-2,4,100)
ridge = Ridge()
coefs_ridge = []

for i in alpha_ridge:
    ridge.set_params(alpha = i)
    ridge.fit(X_train, y_train)
    coefs_ridge.append(ridge.coef_)
    
np.shape(coefs_ridge)
(100, 10)
plt.figure(figsize=(12,10))
ax = plt.gca()
ax.plot(alpha_ridge, coefs_ridge)
ax.set_xscale('log')
plt.axis('tight')
plt.xlabel('alpha')
plt.ylabel('weights: scaled coefficients')
plt.title('Ridge regression coefficients vs. alpha')
plt.legend(df.drop('price',axis=1, inplace=False).columns)

plt.show()


2) Lasso

alpha_lasso = 10**np.linspace(-3,1,100)
lasso = Lasso()
coefs_lasso = []

for i in alpha_lasso:
    lasso.set_params(alpha = i)
    lasso.fit(X_train, y_train)
    coefs_lasso.append(lasso.coef_)
    
np.shape(coefs_lasso)
(100, 10)
plt.figure(figsize=(12,10))
ax = plt.gca()
ax.plot(alpha_lasso, coefs_lasso)
ax.set_xscale('log')
plt.axis('tight')
plt.xlabel('alpha')
plt.ylabel('weights: scaled coefficients')
plt.title('Lasso regression coefficients vs. alpha')
plt.legend(df.drop('price',axis=1, inplace=False).columns)

plt.show()
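
As an aside, scikit-learn can compute the entire Lasso coefficient path in a single call; a sketch (not used for the plot above):

from sklearn.linear_model import lasso_path

# returns the alpha grid, the coefficient path, and the dual gaps
alphas_path, coefs_path, _ = lasso_path(X_train, y_train, alphas=alpha_lasso)
print(coefs_path.shape)   # (n_features, n_alphas)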

3) ElasticNet

alpha_elasticnet = 10**np.linspace(-3,2,100)
elasticnet = ElasticNet()
coefs_elasticnet = []

for i in alpha_elasticnet:
    elasticnet.set_params(alpha = i)
    elasticnet.fit(X_train, y_train)
    coefs_elasticnet.append(elasticnet.coef_)
    
np.shape(coefs_elasticnet)
(100, 10)
plt.figure(figsize=(12,10))
ax = plt.gca()
ax.plot(alpha_elasticnet, coefs_elasticnet)
ax.set_xscale('log')
plt.axis('tight')
plt.xlabel('alpha')
plt.ylabel('weights: scaled coefficients')
plt.title('Elastic Net regression coefficients vs. alpha')
plt.legend(df.drop('price',axis=1, inplace=False).columns)

plt.show()

Cross Validation

1) Ridge

ridgecv = RidgeCV()
ridgecv.fit(X_train, y_train)
ridgecv.alpha_
0.1
alpha_ridge_opt = ridgecv.alpha_
alpha_ridge_opt
0.1
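
Note that RidgeCV() only searched its default grid of (0.1, 1.0, 10.0), so alpha_ = 0.1 just means the smallest candidate won. A finer grid can be supplied explicitly; a sketch (the grid values are an assumption):

ridgecv_fine = RidgeCV(alphas=10**np.linspace(-4, 2, 100))
ridgecv_fine.fit(X_train, y_train)
print(ridgecv_fine.alpha_)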

2) Lasso

lassocv = LassoCV()
lassocv.fit(X_train, y_train)
lassocv.alpha_
0.0007404280761639708
alpha_lasso_opt = lassocv.alpha_
from sklearn.model_selection import cross_val_score   # needed for the CV loop below

RMSE_CV=[]
iterator= np.arange(0.0,0.02,0.001)
for i in iterator:
    MSE = -cross_val_score(estimator = Lasso(alpha=i), X = X_train, y = y_train, cv = 5 , scoring="neg_mean_squared_error" )
    RMSE_CV.append(np.sqrt(MSE).mean())
    
output = pd.DataFrame(list(iterator), columns=['lambda_Lasso'])
output['RMSE_CV']=RMSE_CV

output.head()
lambda_Lasso RMSE_CV
0 0.000 0.591622
1 0.001 0.595970
2 0.002 0.600641
3 0.003 0.604491
4 0.004 0.608323
output['RMSE_CV'].idxmin()
0
output['lambda_Lasso'][output['RMSE_CV'].idxmin()]
0.0
sns.lineplot(x='lambda_Lasso', y='RMSE_CV', data=output , color='r', label="RMSE_CV vs lambda_lasso")
plt.show()


3) ElasticNet

elasticnetcv = ElasticNetCV()
elasticnetcv.fit(X_train, y_train)
elasticnetcv.alpha_
0.0014808561523279417
elasticnetcv.l1_ratio_
0.5
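
ElasticNetCV kept the default l1_ratio=0.5; the mixing parameter can be cross-validated as well by passing a list of candidates. A sketch (the candidate values are an assumption):

encv = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1.0], cv=5)
encv.fit(X_train, y_train)
print(encv.alpha_, encv.l1_ratio_)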
RMSE_CV=[]
iterator= np.arange(0.0,0.02,0.001)
for i in iterator:
    MSE = -cross_val_score(estimator = ElasticNet(alpha=i), X = X_train, y = y_train, cv = 5 , scoring="neg_mean_squared_error" )
    RMSE_CV.append(np.sqrt(MSE).mean())
    
output = pd.DataFrame(list(iterator), columns=['lambda_ElasticNet'])
output['RMSE_CV'] = RMSE_CV

output.head(25)
lambda_ElasticNet RMSE_CV
0 0.000 0.591622
1 0.001 0.597029
2 0.002 0.602498
3 0.003 0.606075
4 0.004 0.609711
5 0.005 0.612586
6 0.006 0.614557
7 0.007 0.616387
8 0.008 0.617937
9 0.009 0.619380
10 0.010 0.620785
11 0.011 0.622211
12 0.012 0.623522
13 0.013 0.624668
14 0.014 0.625692
15 0.015 0.626773
16 0.016 0.627784
17 0.017 0.628810
18 0.018 0.629684
19 0.019 0.630521
output['RMSE_CV'].idxmin()
0
output['lambda_ElasticNet'][output['RMSE_CV'].idxmin()]
0.0
sns.lineplot(x='lambda_ElasticNet', y='RMSE_CV', data=output , color='r', label="RMSE_CV vs lambda_elasticnet")
plt.show()
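
A natural final step (a sketch, not run above) is to refit Ridge and Lasso with the CV-selected penalties and compare their test RMSE against the default-parameter models:

model_ridge_opt = Ridge(alpha=alpha_ridge_opt).fit(X_train, y_train)
model_lasso_opt = Lasso(alpha=alpha_lasso_opt).fit(X_train, y_train)
for name, m in [('ridge, alpha=0.1', model_ridge_opt),
                ('lasso, alpha~0.0007', model_lasso_opt)]:
    rmse = np.sqrt(np.mean(np.square(y_test - m.predict(X_test))))
    print(name, np.round(rmse, 3))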

