# Make Simulated Data For Clustering

## Goal

This post shows how to create artificial data for clustering using NumPy.

## Libraries

In [1]:
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


## Set parameters

In [2]:
# Three clusters in 2D: a mean vector and a 2x2 covariance matrix per cluster,
# both keyed by cluster name.
d_means = {
    'cluster 1': [0, 0],
    'cluster 2': [4, 5],
    'cluster 3': [5, 0],
}
d_covs = {
    'cluster 1': [[1, 1], [1, 4]],
    'cluster 2': [[1, 1], [1, 3]],
    'cluster 3': [[4, 2], [2, 2]],
}


### Generate randomly sampled data

In [28]:
# Peek at the per-cluster frame currently held in df_tmp.
# NOTE(review): df_tmp is assigned by the generation loop in the following cell,
# so this cell was evidently executed out of order — confirm before re-running top to bottom.
df_tmp.head()

Out[28]:
0 1 label
0 5.722012 -0.453365 cluster 3
1 7.691619 0.958336 cluster 3
2 7.546710 2.130443 cluster 3
3 5.442839 -0.909432 cluster 3
4 5.235633 -0.138812 cluster 3
In [32]:
# Sample n_data points per cluster from the parameters above, keep each
# labelled frame in `l`, and draw every cluster on one shared plot.
n_data = 1000

l = []
for cluster, mean in d_means.items():
    samples = np.random.multivariate_normal(mean, d_covs[cluster], n_data)
    df_tmp = pd.DataFrame(samples)
    df_tmp['label'] = cluster
    l.append(df_tmp)
    # One plot call per cluster so each gets its own legend entry.
    plt.plot(df_tmp[0], df_tmp[1], '.', label=cluster, alpha=0.5)

plt.legend()
plt.axis('off')
plt.show()


### Created Data

In [4]:
# Preview the first rows of the combined dataset.
# NOTE(review): df_data is not defined in any cell shown here — presumably it is
# built from the per-cluster frames collected in `l`, with the coordinate columns
# renamed to x/y; confirm and add the defining cell.
df_data.head()

Out[4]:
x y
0 1.141102 0.398633
1 1.012627 -5.213305
2 -0.446332 3.922366
3 -0.961115 1.779184
4 0.191585 0.500205
In [5]:
# Dimensions of the dataset as (rows, columns).
df_data.shape

Out[5]:
(3000, 2)

## Wrap it in a function

In [24]:
def create_clustered_data(d_means, d_covs, n_data=1000, *, random_state=None):
    """Create artificial labelled data for clustering.

    For every cluster named in ``d_means``, ``n_data`` points are drawn from
    a multivariate normal distribution and tagged with the cluster name.

    Parameters
    ----------
    d_means : dict
        Maps each cluster name to that cluster's mean vector.
        Each value is passed to ``multivariate_normal`` as ``mean``.
    d_covs : dict
        Maps each cluster name to that cluster's covariance matrix.
        Each value is passed to ``multivariate_normal`` as ``cov``.
        Must contain every key of ``d_means``.
    n_data : int, default 1000
        Number of points drawn per cluster.
    random_state : None, int or numpy.random.RandomState, optional
        Source of randomness. ``None`` (the default) samples through
        ``np.random.multivariate_normal``, i.e. NumPy's global state, so
        ``np.random.seed`` keeps working. An int seeds a private
        ``RandomState``; an existing ``RandomState`` is used as-is.

    Returns
    -------
    pd.DataFrame
        One row per sampled point: the coordinates in integer-named columns
        (``0``, ``1``, ...) followed by a ``'label'`` column holding the
        cluster name. Clusters appear in the iteration order of ``d_means``.
        The per-cluster frames are concatenated without resetting the index,
        so index values restart at 0 for every cluster.

    Raises
    ------
    KeyError
        If a cluster name in ``d_means`` is missing from ``d_covs``.
    ValueError
        If ``d_means`` is empty (there is nothing to concatenate).

    """
    if random_state is None:
        # Keep the historical behaviour: draw from NumPy's global generator.
        draw = np.random.multivariate_normal
    elif isinstance(random_state, np.random.RandomState):
        draw = random_state.multivariate_normal
    else:
        draw = np.random.RandomState(random_state).multivariate_normal

    frames = []
    for cluster, mean in d_means.items():
        df_cluster = pd.DataFrame(draw(mean, d_covs[cluster], n_data))
        df_cluster['label'] = cluster
        frames.append(df_cluster)
    return pd.concat(frames)