In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings("ignore")
In [2]:
data = pd.read_csv('Mall_Customers.csv')
In [3]:
data.head()
Out[3]:
CustomerID Gender Age Annual Income (k$) Spending Score (1-100)
0 1 Male 19 15 39
1 2 Male 21 15 81
2 3 Female 20 16 6
3 4 Female 23 16 77
4 5 Female 31 17 40

Univariate Analysis¶

The average income of a mall goer is 60,560¶

Median Income of a person going to the mall is 61,500¶

The youngest person going to the mall is 18 and the oldest person going to the mall is 70¶

In [6]:
data.describe()
Out[6]:
CustomerID Age Annual Income (k$) Spending Score (1-100)
count 200.000000 200.000000 200.000000 200.000000
mean 100.500000 38.850000 60.560000 50.200000
std 57.879185 13.969007 26.264721 25.823522
min 1.000000 18.000000 15.000000 1.000000
25% 50.750000 28.750000 41.500000 34.750000
50% 100.500000 36.000000 61.500000 50.000000
75% 150.250000 49.000000 78.000000 73.000000
max 200.000000 70.000000 137.000000 99.000000

We find a normal distribution on this histogram¶

In [246]:
sns.distplot(data['Annual Income (k$)']);
No description has been provided for this image
In [9]:
data.columns
Out[9]:
Index(['CustomerID', 'Gender', 'Age', 'Annual Income (k$)',
       'Spending Score (1-100)'],
      dtype='object')

Automated creating histograms using for loop¶

In [11]:
columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
for i in columns:
    plt.figure()
    sns.distplot(data[i])
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [12]:
sns.kdeplot(data=data, x='Annual Income (k$)', shade=True, hue=data['Gender']);
No description has been provided for this image

For Females age 20-40 their is much more of them going to the mall compared to males¶

In [14]:
columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
for i in columns:
    plt.figure()
    sns.kdeplot(data=data, x=i, shade=True, hue=data['Gender']);
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Our outlier for the Male income is a person making 138k a year¶

In [16]:
columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
for i in columns:
    plt.figure()
    sns.boxplot(data=data, x='Gender', y=data[i], hue=data['Gender']);
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

56% of our Mall customers are Female¶

In [18]:
data['Gender'].value_counts(normalize=True)
Out[18]:
Gender
Female    0.56
Male      0.44
Name: proportion, dtype: float64

Bivariate Analysis¶

In [20]:
sns.scatterplot(data=data, x='Annual Income (k$)', y='Spending Score (1-100)')
Out[20]:
<Axes: xlabel='Annual Income (k$)', ylabel='Spending Score (1-100)'>
No description has been provided for this image

Quick Bivariate Clustering Analysis¶

In [240]:
#data=data.drop('Spending and Income Cluster', axis=1)
pair = sns.pairplot(data, hue='Gender')
pair.savefig("quick_analysis.png") 
No description has been provided for this image

Annual Income of Male is slightly higher, Spending score of females are slighly higher¶

In [24]:
data.groupby('Gender')[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].mean()
Out[24]:
Age Annual Income (k$) Spending Score (1-100)
Gender
Female 38.098214 59.250000 51.526786
Male 39.806818 62.227273 48.511364
In [25]:
data.corr(numeric_only=True)
Out[25]:
CustomerID Age Annual Income (k$) Spending Score (1-100)
CustomerID 1.000000 -0.026763 0.977548 0.013835
Age -0.026763 1.000000 -0.012398 -0.327227
Annual Income (k$) 0.977548 -0.012398 1.000000 0.009903
Spending Score (1-100) 0.013835 -0.327227 0.009903 1.000000

Seeing coorelation between two variables using exploratory data analysis¶

In [242]:
heat = sns.heatmap(data.corr(numeric_only=True), annot=True, cmap='coolwarm')
fig = heat.get_figure()
fig.savefig("heat_map.png") 
No description has been provided for this image

Clustering - Univariate, Bivariate, Multivariate¶

In [92]:
clustering1 = KMeans(n_clusters=3)
In [94]:
clustering1.fit(data[['Annual Income (k$)']])
Out[94]:
KMeans(n_clusters=3)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KMeans(n_clusters=3)
In [96]:
clustering1.labels_
Out[96]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])
In [98]:
data['Income Cluster'] = clustering1.labels_
data.head()
Out[98]:
CustomerID Gender Age Annual Income (k$) Spending Score (1-100) Income Cluster
0 1 Male 19 15 39 1
1 2 Male 21 15 81 1
2 3 Female 20 16 6 1
3 4 Female 23 16 77 1
4 5 Female 31 17 40 1

Majority are in cluster 5¶

In [101]:
data['Income Cluster'].value_counts()
Out[101]:
Income Cluster
0    90
1    74
2    36
Name: count, dtype: int64
In [103]:
clustering1.inertia_
Out[103]:
23517.330930930926
In [105]:
inertia_scores=[]
for i in range(1,11):
    kmeans = KMeans(n_clusters = i)
    kmeans.fit(data[['Annual Income (k$)']])
    inertia_scores.append(kmeans.inertia_)
In [107]:
inertia_scores
Out[107]:
[137277.28000000003,
 48968.02080832332,
 23528.152173913048,
 15460.42951227089,
 8667.679614837509,
 5443.614973544974,
 4109.451471234081,
 3348.18475968476,
 2584.7879156790923,
 1762.9541125541127]
In [109]:
plt.plot(range(1,11), inertia_scores)
Out[109]:
[<matplotlib.lines.Line2D at 0x19a5a9b9e20>]
No description has been provided for this image

Cluster 1 has the lowest annual income, Cluster 2 has the highest spending score¶

In [113]:
data.groupby('Income Cluster')[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].mean()
Out[113]:
Age Annual Income (k$) Spending Score (1-100)
Income Cluster
0 38.722222 67.088889 50.000000
1 39.500000 33.486486 50.229730
2 37.833333 99.888889 50.638889

Bivariate Clustering¶

In [131]:
clustering2 = KMeans(n_clusters=5)
clustering2.fit(data[['Annual Income (k$)', 'Spending Score (1-100)']])
clustering2.labels_
data['Spending and Income Cluster'] = clustering2.labels_
data.head()
Out[131]:
CustomerID Gender Age Annual Income (k$) Spending Score (1-100) Income Cluster Spending and Income Cluster
0 1 Male 19 15 39 1 3
1 2 Male 21 15 81 1 1
2 3 Female 20 16 6 1 3
3 4 Female 23 16 77 1 1
4 5 Female 31 17 40 1 3
In [133]:
inertia_scores2=[]
for i in range(1,11):
    kmeans2 = KMeans(n_clusters = i)
    kmeans2.fit(data[['Annual Income (k$)', 'Spending Score (1-100)']])
    inertia_scores2.append(kmeans2.inertia_)
plt.plot(range(1,11), inertia_scores2)
Out[133]:
[<matplotlib.lines.Line2D at 0x19a628ce4e0>]
No description has been provided for this image

We find five different clusters in the Annual Income and Spending Score plot¶

In [155]:
centers = pd.DataFrame(clustering2.cluster_centers_)
centers.columns = ['x','y']
In [203]:
plt.figure(figsize=(10, 8))
plt.scatter(x=centers['x'], y=centers['y'], s=100, c='black', marker='*')
sns.scatterplot(data=data, x = 'Annual Income (k$)', y = 'Spending Score (1-100)', hue='Spending and Income Cluster', palette='tab10')
plt.savefig('clustering_bivariate.png')
No description has been provided for this image

Cluster 2 has the highest amount of annual income and spending score which would bring us a lot of money, 53% of them are female¶

In [159]:
pd.crosstab(data['Spending and Income Cluster'], data['Gender'], normalize='index')
Out[159]:
Gender Female Male
Spending and Income Cluster
0 0.472222 0.527778
1 0.590909 0.409091
2 0.538462 0.461538
3 0.608696 0.391304
4 0.587500 0.412500

Cluster 1 has the Youngest Age, Lowest Income but a very High Spending score. This could suggest us to make campaigns for young people to spend on Video Game Consoles or Nike Jordans¶

In [162]:
data.groupby('Spending and Income Cluster')[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].mean()
Out[162]:
Age Annual Income (k$) Spending Score (1-100)
Spending and Income Cluster
0 40.666667 87.750000 17.583333
1 25.272727 25.727273 79.363636
2 32.692308 86.538462 82.128205
3 45.217391 26.304348 20.913043
4 42.937500 55.087500 49.712500
In [165]:
# multivariate clustering
from sklearn.preprocessing import StandardScaler
In [167]:
scale = StandardScaler()
In [169]:
data.head()
Out[169]:
CustomerID Gender Age Annual Income (k$) Spending Score (1-100) Income Cluster Spending and Income Cluster
0 1 Male 19 15 39 1 3
1 2 Male 21 15 81 1 1
2 3 Female 20 16 6 1 3
3 4 Female 23 16 77 1 1
4 5 Female 31 17 40 1 3
In [173]:
dff = pd.get_dummies(data, drop_first=True)
dff.head()
Out[173]:
CustomerID Age Annual Income (k$) Spending Score (1-100) Income Cluster Spending and Income Cluster Gender_Male
0 1 19 15 39 1 3 True
1 2 21 15 81 1 1 True
2 3 20 16 6 1 3 False
3 4 23 16 77 1 1 False
4 5 31 17 40 1 3 False
In [175]:
dff.columns
Out[175]:
Index(['CustomerID', 'Age', 'Annual Income (k$)', 'Spending Score (1-100)',
       'Income Cluster', 'Spending and Income Cluster', 'Gender_Male'],
      dtype='object')
In [181]:
dff = dff[['Age', 'Annual Income (k$)', 'Spending Score (1-100)', 'Gender_Male']]
dff.head()
Out[181]:
Age Annual Income (k$) Spending Score (1-100) Gender_Male
0 19 15 39 True
1 21 15 81 True
2 20 16 6 False
3 23 16 77 False
4 31 17 40 False
In [183]:
dff = scale.fit_transform(dff)
In [189]:
dff.head()
Out[189]:
0 1 2 3
0 -1.424569 -1.738999 -0.434801 1.128152
1 -1.281035 -1.738999 1.195704 1.128152
2 -1.352802 -1.700830 -1.715913 -0.886405
3 -1.137502 -1.700830 1.040418 -0.886405
4 -0.563369 -1.662660 -0.395980 -0.886405
In [193]:
dff = pd.DataFrame(scale.fit_transform(dff))
In [195]:
inertia_scores3=[]
for i in range(1,11):
    kmeans3 = KMeans(n_clusters = i)
    kmeans3.fit(dff)
    inertia_scores3.append(kmeans3.inertia_)
plt.plot(range(1,11), inertia_scores3)
Out[195]:
[<matplotlib.lines.Line2D at 0x19a63ae5310>]
No description has been provided for this image
In [201]:
data
Out[201]:
CustomerID Gender Age Annual Income (k$) Spending Score (1-100) Income Cluster Spending and Income Cluster
0 1 Male 19 15 39 1 3
1 2 Male 21 15 81 1 1
2 3 Female 20 16 6 1 3
3 4 Female 23 16 77 1 1
4 5 Female 31 17 40 1 3
... ... ... ... ... ... ... ...
195 196 Female 35 120 79 2 2
196 197 Female 45 126 28 2 0
197 198 Male 32 126 74 2 2
198 199 Male 32 137 18 2 0
199 200 Male 30 137 83 2 2

200 rows × 7 columns

In [207]:
data.to_csv('clustering.csv')