Blogs · Machine Learning · Clustering · Unsupervised Learning

Clustering: DBSCAN

Graphical methods continued...

2020.02.06 · 5 min read · by Zhenlin Wang · updated 2021-09-01

1. DBSCAN Introduction

2. Pros & Cons

Pros

Cons

3. Code Implementation

{% codeblock lang:python %}
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

# To choose the best combination of the algorithm parameters, I will first
# create a matrix of the combinations to be investigated.

from itertools import product

# Load the customer data; the relative path means 'Mall_Customers.csv' must be
# in the current working directory.
mall_data = pd.read_csv('Mall_Customers.csv')

# Subset with numeric variables only; this is the feature matrix fed to DBSCAN.
X_numerics = mall_data[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]

# Parameter grid for the search. Each assignment must sit on its own line:
# when they share a line, everything after the first `#` is swallowed by the
# comment and `min_samples` / `DBSCAN_params` are never defined.
eps_values = np.arange(8, 12.75, 0.25)  # eps values to be investigated
min_samples = np.arange(3, 10)  # min_samples values to be investigated
DBSCAN_params = list(product(eps_values, min_samples))

# One entry per (eps, min_samples) pair, in the same order as DBSCAN_params.
no_of_clusters = []
sil_score = []

for eps, n_min in DBSCAN_params:
    DBS_clustering = DBSCAN(eps=eps, min_samples=n_min).fit(X_numerics)
    # Counts every distinct label. The script later treats label -1 as
    # outliers, so when such points exist they are included in this count.
    no_of_clusters.append(len(np.unique(DBS_clustering.labels_)))
    sil_score.append(silhouette_score(X_numerics, DBS_clustering.labels_))

# Heatmap of the number of distinct labels found for each parameter pair.
# One row per investigated (Eps, Min_samples) combination.
tmp = pd.DataFrame.from_records(DBSCAN_params, columns=['Eps', 'Min_samples'])
tmp['No_of_clusters'] = no_of_clusters

# Reshape to a Min_samples x Eps grid so it can be drawn as a heatmap.
pivot_1 = pd.pivot_table(
    tmp, values='No_of_clusters', index='Min_samples', columns='Eps'
)

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(pivot_1, annot=True, annot_kws={"size": 16}, cmap="YlGnBu", ax=ax)
ax.set_title('Number of clusters')
plt.show()

# Heatmap of the silhouette score obtained for each parameter pair.
# One row per investigated (Eps, Min_samples) combination.
tmp = pd.DataFrame.from_records(DBSCAN_params, columns=['Eps', 'Min_samples'])
tmp['Sil_score'] = sil_score

# Reshape to a Min_samples x Eps grid so it can be drawn as a heatmap.
pivot_1 = pd.pivot_table(
    tmp, values='Sil_score', index='Min_samples', columns='Eps'
)

# Wider figure and smaller annotation font than the cluster-count heatmap.
fig, ax = plt.subplots(figsize=(18, 6))
sns.heatmap(pivot_1, annot=True, annot_kws={"size": 10}, cmap="YlGnBu", ax=ax)
plt.show()

# Final model. NOTE(review): eps=12.5 / min_samples=4 are presumably the pair
# picked from the grid search heatmaps -- confirm against the plots.
DBS_clustering = DBSCAN(eps=12.5, min_samples=4).fit(X_numerics)

# Work on a copy so the original feature frame is left untouched.
DBSCAN_clustered = X_numerics.copy()
DBSCAN_clustered.loc[:, 'Cluster'] = DBS_clustering.labels_  # append labels to points

# Number of points assigned to each label, as a one-column DataFrame.
DBSCAN_clust_sizes = DBSCAN_clustered.groupby('Cluster').size().to_frame()
DBSCAN_clust_sizes.columns = ["DBSCAN_size"]
# NOTE(review): `display` is not imported in this snippet; presumably it relies
# on the IPython/Jupyter session providing it -- confirm before running as a script.
display(DBSCAN_clust_sizes)

# Split the labelled points: label -1 is plotted separately as outliers.
outliers = DBSCAN_clustered[DBSCAN_clustered['Cluster'] == -1]
non_outliers = DBSCAN_clustered[DBSCAN_clustered['Cluster'] != -1]

fig2, axes = plt.subplots(1, 2, figsize=(12, 5))

# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally, while the keyword form also works on older versions.
sns.scatterplot(
    x='Annual Income (k$)', y='Spending Score (1-100)',
    data=non_outliers,
    hue='Cluster', ax=axes[0], palette='Set1', legend='full', s=45,
)

sns.scatterplot(
    x='Age', y='Spending Score (1-100)',
    data=non_outliers,
    hue='Cluster', palette='Set1', ax=axes[1], legend='full', s=45,
)

# Overlay the outliers as small black dots on both panels.
axes[0].scatter(
    outliers['Annual Income (k$)'], outliers['Spending Score (1-100)'],
    s=5, label='outliers', c="k",
)
axes[1].scatter(
    outliers['Age'], outliers['Spending Score (1-100)'],
    s=5, label='outliers', c="k",
)

# Rebuild each legend so it includes the 'outliers' entry, then shrink its text.
for axis in axes:
    axis.legend()
    plt.setp(axis.get_legend().get_texts(), fontsize='10')

plt.show()
{% endcodeblock %}