import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings("ignore")

# Load the mall-customer data set; the path is relative to the working directory.
data = pd.read_csv('Mall_Customers.csv')

# First rows and summary statistics as a sanity check on the load.
data.head()

data.describe()

# `sns.distplot` is deprecated in seaborn; `histplot` with a KDE overlay on a
# density axis draws the equivalent histogram-plus-curve figure.
sns.histplot(data['Annual Income (k$)'], kde=True, stat='density');

data.columns

Index(['CustomerID', 'Gender', 'Age', 'Annual Income (k$)',
       'Spending Score (1-100)'],
      dtype='object')

# One histogram-with-KDE figure per numeric feature.
# `histplot(kde=True, stat='density')` replaces the deprecated `sns.distplot`.
columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
for i in columns:
    plt.figure()
    sns.histplot(data[i], kde=True, stat='density')

# Income density split by gender. `fill=True` replaces the deprecated
# `shade=True`; hue is given as a column name because `data=` is supplied.
sns.kdeplot(data=data, x='Annual Income (k$)', fill=True, hue='Gender');

columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']

# The same gender-split density, one figure per numeric feature.
for i in columns:
    plt.figure()
    sns.kdeplot(data=data, x=i, fill=True, hue='Gender')

# Box plots per gender show the spread and outliers of each feature.
for i in columns:
    plt.figure()
    sns.boxplot(data=data, x='Gender', y=i, hue='Gender')

# Share of customers by gender.
data['Gender'].value_counts(normalize=True)

Gender
Female    0.56
Male      0.44
Name: proportion, dtype: float64

# Annual income against spending score for every customer.
sns.scatterplot(
    data=data,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
)

<Axes: xlabel='Annual Income (k$)', ylabel='Spending Score (1-100)'>

# Pairwise relationships between the numeric columns, coloured by gender.
# NOTE: when re-run after the clustering steps, `data` also carries the
# cluster-label columns and they show up in the grid as well.
pair = sns.pairplot(data, hue='Gender')
pair.savefig("quick_analysis.png")

# Mean age, income and spending score per gender.
data.groupby('Gender')[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].mean()

data.corr(numeric_only=True)

# Start a fresh figure so the heatmap is not drawn onto whichever axes are
# current (the last pair-plot panel when everything runs in one session).
plt.figure()
heat = sns.heatmap(data.corr(numeric_only=True), annot=True, cmap='coolwarm')
fig = heat.get_figure()
fig.savefig("heat_map.png")

# Univariate segmentation on annual income alone.
# random_state pins the centroid initialisation so the cluster numbering is the
# same on every run (the written findings refer to clusters by number);
# n_init is explicit because its default differs between scikit-learn versions.
clustering1 = KMeans(n_clusters=3, n_init=10, random_state=42)

clustering1.fit(data[['Annual Income (k$)']])

KMeans(n_clusters=3)

KMeans(n_clusters=3)

# Cluster index assigned to each row of `data`, in row order.
clustering1.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

# Store each customer's income-cluster label next to the original columns.
data['Income Cluster'] = clustering1.labels_
data.head()

# Number of customers that fell into each income cluster.
data['Income Cluster'].value_counts()

Income Cluster
0    90
1    74
2    36
Name: count, dtype: int64

# Inertia of the three-cluster income model (in scikit-learn: the sum of
# squared distances of samples to their closest cluster centre).
clustering1.inertia_

23517.330930930926

# Inertia for k = 1..10 on annual income, collected for the elbow plot.
# A fixed random_state / n_init keeps the curve identical between runs.
inertia_scores = [
    KMeans(n_clusters=k, n_init=10, random_state=42)
    .fit(data[['Annual Income (k$)']])
    .inertia_
    for k in range(1, 11)
]

inertia_scores

[137277.28000000003,
 48968.02080832332,
 23528.152173913048,
 15460.42951227089,
 8667.679614837509,
 5443.614973544974,
 4109.451471234081,
 3348.18475968476,
 2584.7879156790923,
 1762.9541125541127]

# Elbow plot: inertia against the number of clusters tried (1 through 10).
plt.plot(range(1,11), inertia_scores)

[<matplotlib.lines.Line2D at 0x19a5a9b9e20>]

# Average profile of each income cluster.
data.groupby('Income Cluster')[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].mean()

# Bivariate segmentation on income and spending score.
# random_state / n_init are pinned so the cluster numbering, which the written
# findings rely on, does not change from run to run.
clustering2 = KMeans(n_clusters=5, n_init=10, random_state=42)
clustering2.fit(data[['Annual Income (k$)', 'Spending Score (1-100)']])
data['Spending and Income Cluster'] = clustering2.labels_
data.head()

# Elbow sweep for the two-feature model, k = 1..10.
inertia_scores2 = [
    KMeans(n_clusters=k, n_init=10, random_state=42)
    .fit(data[['Annual Income (k$)', 'Spending Score (1-100)']])
    .inertia_
    for k in range(1, 11)
]
plt.plot(range(1, 11), inertia_scores2)

[<matplotlib.lines.Line2D at 0x19a628ce4e0>]

# Centroids of the income/spending model as a two-column frame.
centers = pd.DataFrame(clustering2.cluster_centers_, columns=['x', 'y'])

# Centroid stars first, then the customers coloured by assigned cluster.
plt.figure(figsize=(10, 8))
plt.scatter(x=centers['x'], y=centers['y'], s=100, c='black', marker='*')
sns.scatterplot(
    data=data,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='Spending and Income Cluster',
    palette='tab10',
)
plt.savefig('clustering_bivariate.png')

# Gender mix within each cluster (each row sums to 1).
pd.crosstab(data['Spending and Income Cluster'], data['Gender'], normalize='index')

# Average age, income and spending score per cluster.
feature_cols = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
data.groupby('Spending and Income Cluster')[feature_cols].mean()

# multivariate clustering
from sklearn.preprocessing import StandardScaler

# Scaler for the multivariate feature set.
scale = StandardScaler()

data.head()

# One-hot encode the non-numeric columns; drop_first=True keeps a single
# indicator per category (Gender becomes Gender_Male).
dff = pd.get_dummies(data, drop_first=True)
dff.head()

dff.columns

Index(['CustomerID', 'Age', 'Annual Income (k$)', 'Spending Score (1-100)',
       'Income Cluster', 'Spending and Income Cluster', 'Gender_Male'],
      dtype='object')

# Keep only the model features; the ID and the earlier cluster labels are dropped.
dff = dff[['Age', 'Annual Income (k$)', 'Spending Score (1-100)', 'Gender_Male']]
dff.head()

# fit_transform returns a plain NumPy array, which has no .head(); wrap the
# result in a DataFrame immediately and scale only once.
dff = pd.DataFrame(scale.fit_transform(dff))

dff.head()

# Elbow sweep on the scaled multivariate features, k = 1..10.
# A fixed random_state / n_init keeps the curve identical between runs.
inertia_scores3 = [
    KMeans(n_clusters=k, n_init=10, random_state=42).fit(dff).inertia_
    for k in range(1, 11)
]
plt.plot(range(1, 11), inertia_scores3)

[<matplotlib.lines.Line2D at 0x19a63ae5310>]

data

# Export the customers together with their cluster labels.
# NOTE(review): the default index=True also writes the row index as an unnamed
# first column — confirm whether downstream consumers expect it.
data.to_csv('clustering.csv')

	CustomerID	Age	Annual Income (k$)	Spending Score (1-100)
count	200.000000	200.000000	200.000000	200.000000
mean	100.500000	38.850000	60.560000	50.200000
std	57.879185	13.969007	26.264721	25.823522
min	1.000000	18.000000	15.000000	1.000000
25%	50.750000	28.750000	41.500000	34.750000
50%	100.500000	36.000000	61.500000	50.000000
75%	150.250000	49.000000	78.000000	73.000000
max	200.000000	70.000000	137.000000	99.000000

	CustomerID	Age	Annual Income (k$)	Spending Score (1-100)
CustomerID	1.000000	-0.026763	0.977548	0.013835
Age	-0.026763	1.000000	-0.012398	-0.327227
Annual Income (k$)	0.977548	-0.012398	1.000000	0.009903
Spending Score (1-100)	0.013835	-0.327227	0.009903	1.000000

	Age	Annual Income (k$)	Spending Score (1-100)
Income Cluster
0	38.722222	67.088889	50.000000
1	39.500000	33.486486	50.229730
2	37.833333	99.888889	50.638889

Gender	Female	Male
Spending and Income Cluster
0	0.472222	0.527778
1	0.590909	0.409091
2	0.538462	0.461538
3	0.608696	0.391304
4	0.587500	0.412500

	Age	Annual Income (k$)	Spending Score (1-100)
Spending and Income Cluster
0	40.666667	87.750000	17.583333
1	25.272727	25.727273	79.363636
2	32.692308	86.538462	82.128205
3	45.217391	26.304348	20.913043
4	42.937500	55.087500	49.712500

Univariate Analysis¶

The average income of a mall goer is 60,560¶

Median Income of a person going to the mall is 61,500¶

The youngest person going to the mall is 18 and the oldest person going to the mall is 70¶

We find a normal distribution on this histogram¶

Automated creating histograms using for loop¶

Among customers aged 20-40, there are many more females than males going to the mall¶

The outlier in male income is a person making 137k a year¶

56% of our Mall customers are Female¶

Bivariate Analysis¶

Quick Bivariate Clustering Analysis¶

The annual income of males is slightly higher; the spending score of females is slightly higher¶

Seeing correlation between two variables using exploratory data analysis¶

Clustering - Univariate, Bivariate, Multivariate¶

The largest group is Income Cluster 0 (90 of the 200 customers)¶

Cluster 1 has the lowest annual income, Cluster 2 has the highest spending score¶

Bivariate Clustering¶

We find five different clusters in the Annual Income and Spending Score plot¶

Cluster 2 combines a high annual income with the highest spending score, which would bring us a lot of money; 54% of them are female¶

Cluster 1 has the youngest age and the lowest income but a very high spending score. This suggests we could run campaigns encouraging young people to spend on video game consoles or Nike Jordans¶

	Age	Annual Income (k$)	Spending Score (1-100)
Gender
Female	38.098214	59.250000	51.526786
Male	39.806818	62.227273	48.511364

	CustomerID	Age	Annual Income (k$)	Spending Score (1-100)	Income Cluster	Spending and Income Cluster	Gender_Male
0	1	19	15	39	1	3	True
1	2	21	15	81	1	1	True
2	3	20	16	6	1	3	False
3	4	23	16	77	1	1	False
4	5	31	17	40	1	3	False

	0	1	2	3
0	-1.424569	-1.738999	-0.434801	1.128152
1	-1.281035	-1.738999	1.195704	1.128152
2	-1.352802	-1.700830	-1.715913	-0.886405
3	-1.137502	-1.700830	1.040418	-0.886405
4	-0.563369	-1.662660	-0.395980	-0.886405