Drop Highly Correlated Features
Libraries¶
In [8]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
import seaborn as sns
Create data with highly correlated variables¶
Load boston housing data¶
In [4]:
# Load the Boston housing dataset bundled with scikit-learn and wrap its
# feature matrix in a DataFrame labelled with the dataset's feature names.
# NOTE(review): load_boston is not available in recent scikit-learn releases;
# confirm the environment pins a version that still ships it.
boston = load_boston()
df_boston = pd.DataFrame(data=boston["data"], columns=boston["feature_names"])
df_boston.head()
Out[4]:
Add another correlated feature¶
In [6]:
# Build a synthetic feature that is an affine function of CRIM plus uniform
# noise in [0, 1), so it is almost perfectly correlated with CRIM.
# The noise comes from a dedicated, seeded generator so this cell (and every
# downstream output) is identical on each Restart & Run All, without touching
# NumPy's global random state.
noise = np.random.RandomState(42).random_sample(df_boston.shape[0])
df_boston['CRIM_correlated'] = df_boston['CRIM'] * 3 + 10 + noise
df_boston.head()
Out[6]:
Calculate Correlation¶
In [7]:
# Pairwise Pearson correlation between every pair of columns; the result is a
# square frame indexed by feature name on both axes.
df_corr = df_boston.corr(method="pearson")
df_corr.head()
Out[7]:
In [10]:
# Visualise the correlation matrix; the trailing semicolon suppresses the
# Axes repr so only the figure is shown.
sns.heatmap(data=df_corr);
Drop highly correlated features¶
In [35]:
# For every pair of features whose absolute correlation is at least
# `threshold`, keep the feature that appears first and drop the later one.
threshold = 0.9
n_features = df_corr.shape[0]

# columns[k] stays True while feature k is still kept.
columns = np.full((n_features,), True, dtype=bool)
for i in range(n_features):
    if not columns[i]:
        # Feature i has already been removed as redundant; it must not be
        # used to eliminate further features, otherwise a chain A~B, B~C
        # would drop C even though C is not highly correlated with A.
        continue
    for j in range(i + 1, n_features):
        # Compare the magnitude: a strong negative correlation makes a
        # feature just as redundant as a strong positive one.
        if columns[j] and abs(df_corr.iloc[i, j]) >= threshold:
            columns[j] = False

# Take the labels from df_corr itself so the boolean mask always has the same
# length as the index it filters (this also keeps the cell safe to re-run
# after df_boston has already been reduced).
selected_columns = df_corr.columns[columns]
df_boston = df_boston[selected_columns]
In [36]:
df_boston.head()
Out[36]: