Make Simulated Data For Clustering
Goal¶
This post shows how to create artificial data for clustering using numpy.
Libraries¶
In [1]:
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
Set parameters¶
In [2]:
# Three Gaussian clusters in 2-D. Both dicts share the same keys:
# d_means holds each cluster's mean vector, d_covs its 2x2 covariance matrix.
d_means = {
    'cluster 1': [0, 0],
    'cluster 2': [4, 5],
    'cluster 3': [5, 0],
}
d_covs = {
    'cluster 1': [[1, 1], [1, 4]],
    'cluster 2': [[1, 1], [1, 3]],
    'cluster 3': [[4, 2], [2, 2]],
}
Generate randomly sampled data¶
In [28]:
# NOTE(review): df_tmp is first assigned in the data-generation cell that
# follows (In [32]); this cell's execution count (In [28]) shows it was run
# out of order. With only the cells visible here, Restart & Run All would hit
# a NameError on this line. Move it below the generation cell or delete it.
df_tmp.head()
Out[28]:
In [32]:
# Draw n_data points per cluster from the Gaussian described by
# d_means / d_covs, scatter-plot each cluster, and keep the labelled samples.
n_data = 1000
l = []
for cluster in d_means.keys():
    arr = np.random.multivariate_normal(d_means[cluster], d_covs[cluster], n_data)
    df_tmp = pd.DataFrame(arr)
    df_tmp['label'] = cluster
    l.append(df_tmp)
    # One plot call per cluster so each gets its own colour and legend entry.
    plt.plot(df_tmp[0], df_tmp[1], '.', label=cluster, alpha=0.5)
plt.legend()
plt.axis('off')
plt.show()

# Stack the per-cluster frames into the single frame that the following
# cells inspect (df_data.head(), df_data.shape).
df_data = pd.concat(l)
Created Data¶
In [4]:
# Preview the first rows of df_data.
# NOTE(review): confirm df_data is assigned before this cell on a fresh kernel.
df_data.head()
Out[4]:
In [5]:
# (number of rows, number of columns) of df_data.
df_data.shape
Out[5]:
Make it a function¶
In [24]:
def create_clustered_data(d_means, d_covs, n_data=1000, random_state=None):
    """Create artificial data for clustering.

    Draws ``n_data`` points per cluster from a multivariate normal
    distribution and stacks all clusters into one labelled DataFrame.

    Parameters
    ----------
    d_means : dict
        Maps cluster name to that cluster's mean vector.
        Each value is passed to ``multivariate_normal`` as ``mean``.
    d_covs : dict
        Maps cluster name to that cluster's covariance matrix.
        Each value is passed to ``multivariate_normal`` as ``cov``.
        Must contain every key found in ``d_means``.
    n_data : int, default 1000
        Number of points sampled for each cluster.
    random_state : int or None, default None
        Seed for a private ``np.random.RandomState``. When None, the
        global ``np.random`` state is used, so ``np.random.seed`` set by
        the caller is still honoured.

    Returns
    -------
    pd.DataFrame
        One row per sampled point: integer-named coordinate columns
        followed by a ``'label'`` column holding the cluster name.
        Frames are concatenated as-is, so the row index restarts at 0
        for every cluster.

    Raises
    ------
    ValueError
        If ``d_means`` is empty.
    KeyError
        If a cluster in ``d_means`` has no entry in ``d_covs``.
    """
    if not d_means:
        # pd.concat([]) would raise ValueError anyway; fail with a clearer message.
        raise ValueError("d_means is empty; at least one cluster is required")

    missing = [name for name in d_means if name not in d_covs]
    if missing:
        raise KeyError("d_covs has no covariance for cluster(s): %r" % (missing,))

    # Fall back to the module-level generator so unseeded calls behave as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    frames = []
    for cluster, mean in d_means.items():
        samples = rng.multivariate_normal(mean, d_covs[cluster], n_data)
        df_cluster = pd.DataFrame(samples)
        df_cluster['label'] = cluster
        frames.append(df_cluster)
    return pd.concat(frames)
# Try the helper on the parameters defined earlier and preview the first rows.
create_clustered_data(d_means=d_means, d_covs=d_covs, n_data=1000).head()
Out[24]:
Comments
Comments powered by Disqus