Posts about Feature Selection

Drop Highly Correlated Features

Goal

This post aims to introduce how to drop highly correlated features.

Reference

Libraries

In [8]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
import seaborn as sns

Create data with highly correlated variables

Load boston housing data

In [4]:
# Load the Boston housing data set and expose its feature matrix as a labelled DataFrame.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 -
# confirm the environment pins an older scikit-learn before re-running this cell.
boston = load_boston()
df_boston = pd.DataFrame(
    data=boston.data,
    columns=boston.feature_names,
)
df_boston.head(n=5)
Out[4]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33

Add another correlated feature

In [6]:
# Seeded generator so the synthetic feature (and every result derived from it)
# is identical after Restart Kernel -> Run All.
rng = np.random.default_rng(0)

# Affine transform of CRIM plus uniform noise in [0, 1): by construction this
# column is almost perfectly correlated with CRIM, which is what the
# correlation-based filter further down is meant to detect.
df_boston['CRIM_correlated'] = df_boston['CRIM'] * 3 + 10 + rng.random(df_boston.shape[0])
df_boston.head()
Out[6]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT CRIM_correlated
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98 10.284178
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14 10.102942
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03 10.387687
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94 10.607908
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33 10.824663

Calculate Correlation

In [7]:
# Pairwise Pearson correlation (pandas' default method) between all columns.
df_corr = df_boston.corr(method='pearson')
df_corr.head(n=5)
Out[7]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT CRIM_correlated
CRIM 1.000000 -0.200469 0.406583 -0.055892 0.420972 -0.219247 0.352734 -0.379670 0.625505 0.582764 0.289946 -0.385064 0.455621 0.999937
ZN -0.200469 1.000000 -0.533828 -0.042697 -0.516604 0.311991 -0.569537 0.664408 -0.311948 -0.314563 -0.391679 0.175520 -0.412995 -0.200756
INDUS 0.406583 -0.533828 1.000000 0.062938 0.763651 -0.391676 0.644779 -0.708027 0.595129 0.720760 0.383248 -0.356977 0.603800 0.406720
CHAS -0.055892 -0.042697 0.062938 1.000000 0.091203 0.091251 0.086518 -0.099176 -0.007368 -0.035587 -0.121515 0.048788 -0.053929 -0.055514
NOX 0.420972 -0.516604 0.763651 0.091203 1.000000 -0.302188 0.731470 -0.769230 0.611441 0.668023 0.188933 -0.380051 0.590879 0.421744
In [10]:
# Correlations are signed and bounded, so pin the colour scale to [-1, 1] and
# centre a diverging palette on 0; with the default data-driven scale the sign
# and strength of a cell cannot be read reliably from its colour.
ax = sns.heatmap(df_corr, vmin=-1, vmax=1, center=0, cmap='coolwarm')
ax.set_title('Pairwise feature correlation');

Drop highly correlated features

In [35]:
# Absolute correlation at or above which two features are treated as redundant.
threshold = 0.9

# Compare |r| rather than r so that strongly *negative* correlations are
# treated as redundant too.
corr_strength = df_corr.abs().to_numpy()

# Only look strictly above the diagonal: every pair (i, j) with i < j is
# inspected once and the self-correlation on the diagonal is ignored.
upper_triangle = np.triu(np.ones(corr_strength.shape, dtype=bool), k=1)

# Keep-mask over df_corr's columns: feature j is dropped as soon as any earlier
# feature i < j reaches the threshold, so the first feature of a pair survives.
columns = ~((corr_strength >= threshold) & upper_triangle).any(axis=0)

# Resolve the mask to column *names* via df_corr and filter df_boston by name.
# This keeps the cell idempotent: re-running it after df_boston has already
# been reduced no longer fails on a mask/column length mismatch.
dropped_columns = df_corr.columns[~columns]
selected_columns = df_boston.columns[~df_boston.columns.isin(dropped_columns)]
df_boston = df_boston[selected_columns]
In [36]:
# Preview the frame after the correlated columns have been removed.
df_boston.head(n=5)
Out[36]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD PTRATIO B LSTAT
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 15.3 396.90 4.98
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 17.8 396.90 9.14
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 17.8 392.83 4.03
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 18.7 394.63 2.94
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 18.7 396.90 5.33

Variance Thresholding For Feature Selection

Goal

This post aims to introduce how to conduct feature selection by variance thresholding.

Reference

Libraries

In [26]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Load Boston housing data

In [4]:
# Start again from the raw Boston housing features as a labelled DataFrame.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 -
# confirm the environment pins an older scikit-learn before re-running this cell.
boston = load_boston()
df_boston = pd.DataFrame(
    data=boston.data,
    columns=boston.feature_names,
)
df_boston.head(n=5)
Out[4]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33

Add low variance columns

In [14]:
# Reference point: empirical variance of uniform noise, one sample per data row.
uniform_noise = np.random.random(size=len(df_boston))
uniform_noise.var()
Out[14]:
0.08309365785384086
In [16]:
# Seeded generator so the synthetic column is identical after
# Restart Kernel -> Run All.
rng = np.random.default_rng(0)

# Constant column: its variance is exactly 0.
df_boston['low_variance'] = 1

# One 0/1 value per row from thresholding uniform noise at 0.5. Despite its
# name, such a column has a variance of about 0.25, which is above the 0.1
# cut-off used later in this notebook, so it is kept by the selector, unlike
# `low_variance`.
df_boston['low_variance2'] = (rng.random(size=df_boston.shape[0]) > 0.5) * 1
df_boston.head()
Out[16]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT low_variance low_variance2
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98 1 0
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14 1 1
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03 1 0
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94 1 1
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33 1 1

Variance Thresholding

In [50]:
# Variance cut-off handed to the selector; also reused when plotting below.
variance_threshold = 0.1

# fit() hands back the fitted selector, so it can be bound in the same statement.
selection = VarianceThreshold(threshold=variance_threshold).fit(df_boston)
selection
Out[50]:
VarianceThreshold(threshold=0.1)
In [51]:
# Bar chart of the fitted per-column variances on a log scale, with the
# cut-off drawn as a dotted red horizontal line.
variances = pd.Series(selection.variances_, index=df_boston.columns)
ax = variances.plot.bar(logy=True)
ax.axhline(variance_threshold, linestyle='dotted', color='r');
In [52]:
# get_support() is a boolean mask over the input columns; use it to recover the
# names of the surviving columns for the array returned by transform().
kept_columns = df_boston.columns[selection.get_support()]
selected_values = selection.transform(df_boston)
df_boston_selected = pd.DataFrame(selected_values, columns=kept_columns)
df_boston_selected.head(n=5)
Out[52]:
CRIM ZN INDUS RM AGE DIS RAD TAX PTRATIO B LSTAT low_variance2
0 0.00632 18.0 2.31 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98 0.0
1 0.02731 0.0 7.07 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14 1.0
2 0.02729 0.0 7.07 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03 0.0
3 0.03237 0.0 2.18 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94 1.0
4 0.06905 0.0 2.18 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33 1.0