In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings("ignore")
In [2]:
data = pd.read_csv('Mall_Customers.csv')
In [3]:
data.head()
Out[3]:
CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | |
---|---|---|---|---|---|
0 | 1 | Male | 19 | 15 | 39 |
1 | 2 | Male | 21 | 15 | 81 |
2 | 3 | Female | 20 | 16 | 6 |
3 | 4 | Female | 23 | 16 | 77 |
4 | 5 | Female | 31 | 17 | 40 |
Univariate Analysis¶
In [6]:
data.describe()
Out[6]:
CustomerID | Age | Annual Income (k$) | Spending Score (1-100) | |
---|---|---|---|---|
count | 200.000000 | 200.000000 | 200.000000 | 200.000000 |
mean | 100.500000 | 38.850000 | 60.560000 | 50.200000 |
std | 57.879185 | 13.969007 | 26.264721 | 25.823522 |
min | 1.000000 | 18.000000 | 15.000000 | 1.000000 |
25% | 50.750000 | 28.750000 | 41.500000 | 34.750000 |
50% | 100.500000 | 36.000000 | 61.500000 | 50.000000 |
75% | 150.250000 | 49.000000 | 78.000000 | 73.000000 |
max | 200.000000 | 70.000000 | 137.000000 | 99.000000 |
We find a normal distribution on this histogram¶
In [246]:
sns.distplot(data['Annual Income (k$)']);
In [9]:
data.columns
Out[9]:
Index(['CustomerID', 'Gender', 'Age', 'Annual Income (k$)', 'Spending Score (1-100)'], dtype='object')
Automated creating histograms using for loop¶
In [11]:
columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
for i in columns:
plt.figure()
sns.distplot(data[i])
In [12]:
sns.kdeplot(data=data, x='Annual Income (k$)', shade=True, hue=data['Gender']);
For Females age 20-40 their is much more of them going to the mall compared to males¶
In [14]:
columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
for i in columns:
plt.figure()
sns.kdeplot(data=data, x=i, shade=True, hue=data['Gender']);
Our outlier for the Male income is a person making 138k a year¶
In [16]:
columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
for i in columns:
plt.figure()
sns.boxplot(data=data, x='Gender', y=data[i], hue=data['Gender']);
56% of our Mall customers are Female¶
In [18]:
data['Gender'].value_counts(normalize=True)
Out[18]:
Gender Female 0.56 Male 0.44 Name: proportion, dtype: float64
Bivariate Analysis¶
In [20]:
sns.scatterplot(data=data, x='Annual Income (k$)', y='Spending Score (1-100)')
Out[20]:
<Axes: xlabel='Annual Income (k$)', ylabel='Spending Score (1-100)'>
Quick Bivariate Clustering Analysis¶
In [240]:
#data=data.drop('Spending and Income Cluster', axis=1)
pair = sns.pairplot(data, hue='Gender')
pair.savefig("quick_analysis.png")
Annual Income of Male is slightly higher, Spending score of females are slighly higher¶
In [24]:
data.groupby('Gender')[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].mean()
Out[24]:
Age | Annual Income (k$) | Spending Score (1-100) | |
---|---|---|---|
Gender | |||
Female | 38.098214 | 59.250000 | 51.526786 |
Male | 39.806818 | 62.227273 | 48.511364 |
In [25]:
data.corr(numeric_only=True)
Out[25]:
CustomerID | Age | Annual Income (k$) | Spending Score (1-100) | |
---|---|---|---|---|
CustomerID | 1.000000 | -0.026763 | 0.977548 | 0.013835 |
Age | -0.026763 | 1.000000 | -0.012398 | -0.327227 |
Annual Income (k$) | 0.977548 | -0.012398 | 1.000000 | 0.009903 |
Spending Score (1-100) | 0.013835 | -0.327227 | 0.009903 | 1.000000 |
Seeing coorelation between two variables using exploratory data analysis¶
In [242]:
heat = sns.heatmap(data.corr(numeric_only=True), annot=True, cmap='coolwarm')
fig = heat.get_figure()
fig.savefig("heat_map.png")
Clustering - Univariate, Bivariate, Multivariate¶
In [92]:
clustering1 = KMeans(n_clusters=3)
In [94]:
clustering1.fit(data[['Annual Income (k$)']])
Out[94]:
KMeans(n_clusters=3)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KMeans(n_clusters=3)
In [96]:
clustering1.labels_
Out[96]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
In [98]:
data['Income Cluster'] = clustering1.labels_
data.head()
Out[98]:
CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | Income Cluster | |
---|---|---|---|---|---|---|
0 | 1 | Male | 19 | 15 | 39 | 1 |
1 | 2 | Male | 21 | 15 | 81 | 1 |
2 | 3 | Female | 20 | 16 | 6 | 1 |
3 | 4 | Female | 23 | 16 | 77 | 1 |
4 | 5 | Female | 31 | 17 | 40 | 1 |
Majority are in cluster 5¶
In [101]:
data['Income Cluster'].value_counts()
Out[101]:
Income Cluster 0 90 1 74 2 36 Name: count, dtype: int64
In [103]:
clustering1.inertia_
Out[103]:
23517.330930930926
In [105]:
inertia_scores=[]
for i in range(1,11):
kmeans = KMeans(n_clusters = i)
kmeans.fit(data[['Annual Income (k$)']])
inertia_scores.append(kmeans.inertia_)
In [107]:
inertia_scores
Out[107]:
[137277.28000000003, 48968.02080832332, 23528.152173913048, 15460.42951227089, 8667.679614837509, 5443.614973544974, 4109.451471234081, 3348.18475968476, 2584.7879156790923, 1762.9541125541127]
In [109]:
plt.plot(range(1,11), inertia_scores)
Out[109]:
[<matplotlib.lines.Line2D at 0x19a5a9b9e20>]
Cluster 1 has the lowest annual income, Cluster 2 has the highest spending score¶
In [113]:
data.groupby('Income Cluster')[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].mean()
Out[113]:
Age | Annual Income (k$) | Spending Score (1-100) | |
---|---|---|---|
Income Cluster | |||
0 | 38.722222 | 67.088889 | 50.000000 |
1 | 39.500000 | 33.486486 | 50.229730 |
2 | 37.833333 | 99.888889 | 50.638889 |
Bivariate Clustering¶
In [131]:
clustering2 = KMeans(n_clusters=5)
clustering2.fit(data[['Annual Income (k$)', 'Spending Score (1-100)']])
clustering2.labels_
data['Spending and Income Cluster'] = clustering2.labels_
data.head()
Out[131]:
CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | Income Cluster | Spending and Income Cluster | |
---|---|---|---|---|---|---|---|
0 | 1 | Male | 19 | 15 | 39 | 1 | 3 |
1 | 2 | Male | 21 | 15 | 81 | 1 | 1 |
2 | 3 | Female | 20 | 16 | 6 | 1 | 3 |
3 | 4 | Female | 23 | 16 | 77 | 1 | 1 |
4 | 5 | Female | 31 | 17 | 40 | 1 | 3 |
In [133]:
inertia_scores2=[]
for i in range(1,11):
kmeans2 = KMeans(n_clusters = i)
kmeans2.fit(data[['Annual Income (k$)', 'Spending Score (1-100)']])
inertia_scores2.append(kmeans2.inertia_)
plt.plot(range(1,11), inertia_scores2)
Out[133]:
[<matplotlib.lines.Line2D at 0x19a628ce4e0>]
We find five different clusters in the Annual Income and Spending Score plot¶
In [155]:
centers = pd.DataFrame(clustering2.cluster_centers_)
centers.columns = ['x','y']
In [203]:
plt.figure(figsize=(10, 8))
plt.scatter(x=centers['x'], y=centers['y'], s=100, c='black', marker='*')
sns.scatterplot(data=data, x = 'Annual Income (k$)', y = 'Spending Score (1-100)', hue='Spending and Income Cluster', palette='tab10')
plt.savefig('clustering_bivariate.png')
Cluster 2 has the highest amount of annual income and spending score which would bring us a lot of money, 53% of them are female¶
In [159]:
pd.crosstab(data['Spending and Income Cluster'], data['Gender'], normalize='index')
Out[159]:
Gender | Female | Male |
---|---|---|
Spending and Income Cluster | ||
0 | 0.472222 | 0.527778 |
1 | 0.590909 | 0.409091 |
2 | 0.538462 | 0.461538 |
3 | 0.608696 | 0.391304 |
4 | 0.587500 | 0.412500 |
Cluster 1 has the Youngest Age, Lowest Income but a very High Spending score. This could suggest us to make campaigns for young people to spend on Video Game Consoles or Nike Jordans¶
In [162]:
data.groupby('Spending and Income Cluster')[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].mean()
Out[162]:
Age | Annual Income (k$) | Spending Score (1-100) | |
---|---|---|---|
Spending and Income Cluster | |||
0 | 40.666667 | 87.750000 | 17.583333 |
1 | 25.272727 | 25.727273 | 79.363636 |
2 | 32.692308 | 86.538462 | 82.128205 |
3 | 45.217391 | 26.304348 | 20.913043 |
4 | 42.937500 | 55.087500 | 49.712500 |
In [165]:
# multivariate clustering
from sklearn.preprocessing import StandardScaler
In [167]:
scale = StandardScaler()
In [169]:
data.head()
Out[169]:
CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | Income Cluster | Spending and Income Cluster | |
---|---|---|---|---|---|---|---|
0 | 1 | Male | 19 | 15 | 39 | 1 | 3 |
1 | 2 | Male | 21 | 15 | 81 | 1 | 1 |
2 | 3 | Female | 20 | 16 | 6 | 1 | 3 |
3 | 4 | Female | 23 | 16 | 77 | 1 | 1 |
4 | 5 | Female | 31 | 17 | 40 | 1 | 3 |
In [173]:
dff = pd.get_dummies(data, drop_first=True)
dff.head()
Out[173]:
CustomerID | Age | Annual Income (k$) | Spending Score (1-100) | Income Cluster | Spending and Income Cluster | Gender_Male | |
---|---|---|---|---|---|---|---|
0 | 1 | 19 | 15 | 39 | 1 | 3 | True |
1 | 2 | 21 | 15 | 81 | 1 | 1 | True |
2 | 3 | 20 | 16 | 6 | 1 | 3 | False |
3 | 4 | 23 | 16 | 77 | 1 | 1 | False |
4 | 5 | 31 | 17 | 40 | 1 | 3 | False |
In [175]:
dff.columns
Out[175]:
Index(['CustomerID', 'Age', 'Annual Income (k$)', 'Spending Score (1-100)', 'Income Cluster', 'Spending and Income Cluster', 'Gender_Male'], dtype='object')
In [181]:
dff = dff[['Age', 'Annual Income (k$)', 'Spending Score (1-100)', 'Gender_Male']]
dff.head()
Out[181]:
Age | Annual Income (k$) | Spending Score (1-100) | Gender_Male | |
---|---|---|---|---|
0 | 19 | 15 | 39 | True |
1 | 21 | 15 | 81 | True |
2 | 20 | 16 | 6 | False |
3 | 23 | 16 | 77 | False |
4 | 31 | 17 | 40 | False |
In [183]:
dff = scale.fit_transform(dff)
In [189]:
dff.head()
Out[189]:
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | -1.424569 | -1.738999 | -0.434801 | 1.128152 |
1 | -1.281035 | -1.738999 | 1.195704 | 1.128152 |
2 | -1.352802 | -1.700830 | -1.715913 | -0.886405 |
3 | -1.137502 | -1.700830 | 1.040418 | -0.886405 |
4 | -0.563369 | -1.662660 | -0.395980 | -0.886405 |
In [193]:
dff = pd.DataFrame(scale.fit_transform(dff))
In [195]:
inertia_scores3=[]
for i in range(1,11):
kmeans3 = KMeans(n_clusters = i)
kmeans3.fit(dff)
inertia_scores3.append(kmeans3.inertia_)
plt.plot(range(1,11), inertia_scores3)
Out[195]:
[<matplotlib.lines.Line2D at 0x19a63ae5310>]
In [201]:
data
Out[201]:
CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | Income Cluster | Spending and Income Cluster | |
---|---|---|---|---|---|---|---|
0 | 1 | Male | 19 | 15 | 39 | 1 | 3 |
1 | 2 | Male | 21 | 15 | 81 | 1 | 1 |
2 | 3 | Female | 20 | 16 | 6 | 1 | 3 |
3 | 4 | Female | 23 | 16 | 77 | 1 | 1 |
4 | 5 | Female | 31 | 17 | 40 | 1 | 3 |
... | ... | ... | ... | ... | ... | ... | ... |
195 | 196 | Female | 35 | 120 | 79 | 2 | 2 |
196 | 197 | Female | 45 | 126 | 28 | 2 | 0 |
197 | 198 | Male | 32 | 126 | 74 | 2 | 2 |
198 | 199 | Male | 32 | 137 | 18 | 2 | 0 |
199 | 200 | Male | 30 | 137 | 83 | 2 | 2 |
200 rows × 7 columns
In [207]:
data.to_csv('clustering.csv')