#-----------------------------------------------------------------------------------------------------------------------------
#------------------------------------------------[ Libraries Imports ]--------------------------------------------------------
#-----------------------------------------------------------------------------------------------------------------------------
import numpy as np  # needed only if the equal-width binning alternative further below is enabled
import pandas as pd
from sklearn import preprocessing
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
#-----------------------------------------------------------------------------------------------------------------------------
#---------------------------------------------------[ Import DataBase ]-------------------------------------------------------
#-----------------------------------------------------------------------------------------------------------------------------
dataTelecom = pd.read_excel('S:\\TunisieTelecom\\TelecomData.xlsx')
dataTelecom.head(100)
 | STATUT | OFFRE | ANC_M | HANDSET | revenu_voix | revenu_inter | NB_JOUR_ACTIVITE_TAXE | NB_JOUR_APPEL_TAXE | DUREE_APPEL_TOT | DUREE_APPEL_TAXEE | ... | FREQ_USSD_VOIX | FREQ_USSD_SMS | VOLUME_SESSION | VOLUME_SESSION_WEEKEND | REVENU_VAS | ARPU | P_revenu_data | P_revenu_voix_c | P_revenu_vas_c | id_client |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Active | Offre30 | 123 | 2G | 12.709375 | 1.464447 | 10.366042 | 10.658842 | 27.261836 | 2.125971 | ... | 28.679570 | 146.930634 | 200000.000000 | 54.025577 | 1.059822 | 4.325398 | 0.136685 | 0.943095 | 0.279541 | 1.0 |
1 | Active | Offre8 | 98 | 2G | 3.000000 | 0.756078 | 1.330736 | 7.079320 | 19.883099 | 7.735475 | ... | 28.490597 | 169.219363 | 3337.992419 | 24.789260 | 1.411803 | 2.677563 | 0.533695 | 0.540374 | 0.712010 | 2.0 |
2 | Active | Offre24 | 90 | 4G | 32.514156 | 0.681197 | 0.158160 | 4.367702 | 133.476368 | 7.681088 | ... | 190.000000 | 44.403308 | 107082.775926 | 160.031496 | 18.195224 | 1.954007 | 0.799606 | 0.071368 | 0.376754 | 3.0 |
3 | Active | Offre10 | 226 | 2G | 3.821551 | 5.265345 | 4.003452 | 3.086766 | 35.053364 | 8.540951 | ... | 19.412661 | 190.000000 | 128700.752169 | 102.668979 | 17.746873 | 2.496790 | 0.426526 | 0.314082 | 0.676112 | 4.0 |
4 | Active | Offre1 | 139 | 2G | 60.009385 | 1.957144 | 5.689241 | 0.403300 | 0.844956 | 12.355853 | ... | 81.567146 | 6.365788 | 75654.384291 | 0.341570 | 7.280861 | 2.291381 | 0.319736 | 0.600057 | 0.719450 | 5.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
95 | Active | Offre8 | 159 | 2G | 3.000000 | 2.250293 | 21.961465 | 2.872333 | 1.826910 | 27.907662 | ... | 12.954468 | 23.226754 | 32664.259485 | 21.276030 | 6.134130 | 0.689345 | 0.857627 | 0.392768 | 0.657034 | 96.0 |
96 | Suspended | Offre2 | 145 | 2G | 3.000000 | 3.473084 | 3.575555 | 0.298674 | 80.858548 | 5.349497 | ... | 153.474792 | 190.000000 | 82909.748831 | 26.213769 | 1.504337 | 0.164892 | 0.980014 | 0.467268 | 0.150409 | 97.0 |
97 | Active | Offre10 | 185 | 2G | 4.531545 | 2.142349 | 3.470616 | 6.400014 | 107.330103 | 5.199869 | ... | 157.487581 | 36.176280 | 141996.977868 | 42.321968 | 1.637815 | 1.848815 | 0.772398 | 0.407095 | 0.545033 | 98.0 |
98 | Active | Offre15 | 197 | 3G | 11.762926 | 3.474145 | 2.842419 | 10.629819 | 86.011908 | 0.124836 | ... | 159.332900 | 77.858228 | 12052.670191 | 46.685004 | 1.422138 | 1.944146 | 0.152620 | 0.243845 | 0.387906 | 99.0 |
99 | Suspended | Offre30 | 178 | 2G | 17.606096 | 0.019640 | 1.850818 | 0.184009 | 89.864928 | 5.858613 | ... | 18.339092 | 47.120168 | 134081.459601 | 10.649038 | 0.640634 | 2.204618 | 0.893336 | 0.293854 | 0.121368 | 100.0 |
100 rows × 64 columns
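# Optional sketch: quick sanity checks on the raw frame before selecting variables
# (dimensions, column types, worst missing-value columns); assumes dataTelecom loaded as above.
print(dataTelecom.shape)
print(dataTelecom.dtypes.value_counts())
print(dataTelecom.isna().sum().sort_values(ascending=False).head(10))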
# Select the most relevant variables for the clustering
selected_variables = [
    'FREQ_USSD', 'revenu_cdr_c', 'DUREE_APPEL_TOT', 'MNT_RECH',
    'VOLUME_SESSION', 'MNT_FORFAIT_DATA', 'FREQ_ACT_OUT'
]
# Keep only the selected variables from the dataframe
dataTelecomSelected = dataTelecom[selected_variables].copy()
# Draw a random sample of 1000 customers (a fixed random_state could be passed to make the sample reproducible)
dataTelecomSelectedSample = dataTelecomSelected.sample(n=1000)
KmeansData = dataTelecomSelectedSample.copy()
dataTelecomSelectedSample.head(1000)
 | FREQ_USSD | revenu_cdr_c | DUREE_APPEL_TOT | MNT_RECH | VOLUME_SESSION | MNT_FORFAIT_DATA | FREQ_ACT_OUT |
---|---|---|---|---|---|---|---|
8272 | 117.715662 | 3.609806 | 132.448822 | 9.867506 | 170666.351197 | 19.796220 | 4.163475 |
27269 | 51.607567 | 25.120910 | 96.453174 | 21.567470 | 150130.855066 | 7.514821 | 2.672553 |
67156 | 81.423246 | 15.146389 | 101.062875 | 5.264629 | 15861.113848 | 0.865813 | 1.686709 |
57271 | 41.336494 | 5.563921 | 6.133606 | 19.266939 | 38589.945279 | 0.245570 | 2.135797 |
31677 | 87.387539 | 8.839919 | 401.105486 | 1.699672 | 135452.180362 | 1.941672 | 12.948769 |
... | ... | ... | ... | ... | ... | ... | ... |
21195 | 3.678090 | 6.354561 | 34.260661 | 4.839633 | 42362.792746 | 5.064106 | 8.070536 |
41095 | 81.029405 | 28.014919 | 45.847492 | 9.183478 | 7338.678331 | 31.952191 | 15.306484 |
75243 | 93.898015 | 6.523911 | 8.333489 | 4.737723 | 117516.039349 | 7.677401 | 2.275896 |
26227 | 7.331037 | 19.156340 | 172.246763 | 0.787336 | 76963.666513 | 0.155585 | 2.170419 |
24233 | 102.938425 | 4.201764 | 335.964029 | 15.895931 | 200000.000000 | 0.186818 | 13.451801 |
1000 rows × 7 columns
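# Optional sketch: inspect pairwise correlations of the selected variables — distance-based
# clustering is easier to interpret when the retained variables are not strongly redundant.
# Assumes the 1000-row sample built above.
corr = dataTelecomSelectedSample.corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.title('Correlation between selected variables')
plt.show()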
#-----------------------------------------------------------------------------------------------------------------------------
#---------------------------------------------------[ Clustering ]------------------------------------------------------------
#-----------------------------------------------------------------------------------------------------------------------------
#-------------------------------------> Elbow Method
# Standardize the data to have zero mean and unit variance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(KmeansData)
# Determine the optimal number of clusters using the Elbow Method
inertia = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)  # n_init set explicitly for stable results across scikit-learn versions
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)
# Plot the Elbow Method to find the optimal number of clusters
plt.figure(figsize=(8, 6))
plt.plot(range(1, 11), inertia, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Within-cluster Sum of Squares (Inertia)')
plt.title('Elbow Method for Optimal k')
plt.grid(True)
plt.show()
# The elbow plot suggests k = 6
# Perform k-means clustering with the selected number of clusters
k = 6
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)
# Add the cluster labels to the original DataFrame
KmeansData['Cluster'] = cluster_labels
# Explore the cluster characteristics (e.g., mean values of each variable per cluster)
cluster_characteristics = KmeansData.groupby('Cluster')[selected_variables].mean()
# Print the cluster characteristics
print(cluster_characteristics)
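# Optional sketch: check the size of each cluster — a very small cluster often corresponds to
# outliers rather than a genuine customer segment. Assumes KmeansData labeled above.
print(KmeansData['Cluster'].value_counts().sort_index())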
# You can further analyze and interpret the clusters to gain insights into customer segments based on usage behavior.
# Higher silhouette scores indicate better-defined clusters.
silhouette_avg = silhouette_score(X_scaled, cluster_labels)
print("Average Silhouette Score:", silhouette_avg)
Average Silhouette Score: 0.1718211576611401
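# Optional sketch: cross-check the elbow choice with the average silhouette over a range of k;
# higher values indicate better-separated clusters. Assumes X_scaled from above.
for k_try in range(2, 11):
    labels_k = KMeans(n_clusters=k_try, n_init=10, random_state=42).fit_predict(X_scaled)
    print(f"k={k_try}: silhouette={silhouette_score(X_scaled, labels_k):.4f}")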
# ---> CAH (agglomerative hierarchical clustering)
plt.figure(figsize=(20, 16))
# Generate the linkage matrix.
# Note: the linkage here is computed on the raw (unscaled) sample, so large-scale variables
# such as VOLUME_SESSION dominate the Euclidean distances; the standardized version follows below.
Z = linkage(dataTelecomSelectedSample, method='ward', metric='euclidean')
plt.title("CAH")
dendrogram(Z, labels=dataTelecomSelectedSample.index, orientation='left', color_threshold=0)
plt.show()
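# Optional sketch: instead of a distance threshold, cut the dendrogram into an exact number of
# clusters with criterion='maxclust'. Assumes the linkage matrix Z built above.
cah_labels_6 = fcluster(Z, t=6, criterion='maxclust')
print(pd.Series(cah_labels_6).value_counts().sort_index())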
# ---> Centering and scaling (standardization)
scaler = preprocessing.StandardScaler()
dataTelecomSelectedSample_scaled = scaler.fit_transform(dataTelecomSelectedSample)
dataTelecomSelectedSample_scaled = pd.DataFrame(dataTelecomSelectedSample_scaled, index=dataTelecomSelectedSample.index, columns=dataTelecomSelectedSample.columns)
print('dataTelecomSelectedSample_scaled', dataTelecomSelectedSample_scaled.shape)
dataTelecomSelectedSample_scaled (1000, 7)
# Compute the equal-width interval length (only used by the commented-out alternative below)
max_value = dataTelecomSelectedSample['revenu_cdr_c'].max()
interval_length = max_value / 5
# Define the bin edges; each bin is left-open / right-closed: ]lower, upper]
intervals = [-0.1, 5, 10, 16, 21, 217]
# intervals = np.arange(-1, max_value + interval_length, interval_length)  # equal-width alternative; starts slightly below 0 so the lowest values are included
# Assign interval labels
labels = ['Interval 1', 'Interval 2', 'Interval 3', 'Interval 4', 'Interval 5']
# Transform column values into intervals
dataTelecomSelectedSample['Intervals'] = pd.cut(dataTelecomSelectedSample['revenu_cdr_c'], bins=intervals, labels=labels, right=True)
dataTelecomSelectedSample.head(1000)
 | FREQ_USSD | revenu_cdr_c | DUREE_APPEL_TOT | MNT_RECH | VOLUME_SESSION | MNT_FORFAIT_DATA | FREQ_ACT_OUT | Intervals |
---|---|---|---|---|---|---|---|---|
8272 | 117.715662 | 3.609806 | 132.448822 | 9.867506 | 170666.351197 | 19.796220 | 4.163475 | Interval 1 |
27269 | 51.607567 | 25.120910 | 96.453174 | 21.567470 | 150130.855066 | 7.514821 | 2.672553 | Interval 5 |
67156 | 81.423246 | 15.146389 | 101.062875 | 5.264629 | 15861.113848 | 0.865813 | 1.686709 | Interval 3 |
57271 | 41.336494 | 5.563921 | 6.133606 | 19.266939 | 38589.945279 | 0.245570 | 2.135797 | Interval 2 |
31677 | 87.387539 | 8.839919 | 401.105486 | 1.699672 | 135452.180362 | 1.941672 | 12.948769 | Interval 2 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
21195 | 3.678090 | 6.354561 | 34.260661 | 4.839633 | 42362.792746 | 5.064106 | 8.070536 | Interval 2 |
41095 | 81.029405 | 28.014919 | 45.847492 | 9.183478 | 7338.678331 | 31.952191 | 15.306484 | Interval 5 |
75243 | 93.898015 | 6.523911 | 8.333489 | 4.737723 | 117516.039349 | 7.677401 | 2.275896 | Interval 2 |
26227 | 7.331037 | 19.156340 | 172.246763 | 0.787336 | 76963.666513 | 0.155585 | 2.170419 | Interval 4 |
24233 | 102.938425 | 4.201764 | 335.964029 | 15.895931 | 200000.000000 | 0.186818 | 13.451801 | Interval 1 |
1000 rows × 8 columns
y = dataTelecomSelectedSample['Intervals']
print('y', y.shape, y.unique())  # if variables are strongly correlated, keep only one of them: choose variables whose mutual correlation is low
y (1000,) ['Interval 1', 'Interval 5', 'Interval 3', 'Interval 2', 'Interval 4'] Categories (5, object): ['Interval 1' < 'Interval 2' < 'Interval 3' < 'Interval 4' < 'Interval 5']
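# Optional sketch: check how the sample spreads across the intervals (dropna=False would also
# surface any value that fell outside the defined bin edges). Assumes y built above.
print(y.value_counts(dropna=False).sort_index())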
class_color = {'Interval 1': 'red', 'Interval 2': 'yellow', 'Interval 3': 'green', 'Interval 4': 'blue', 'Interval 5': 'violet'}
#class_color = {'Interval 1': '#F3FF83', 'Interval 2': '#A0D287', 'Interval 3': '#4BA48D','Interval 4':'#05728A','Interval 5':'#1D4060'}
y_color = [class_color[c] for c in y]
metric = 'euclidean'
method = 'ward'
cmap = 'coolwarm'
clustergrid = sns.clustermap(dataTelecomSelectedSample_scaled.T, figsize=(32, 20), metric=metric, method=method, cmap=cmap,
                             row_cluster=True, col_cluster=True, col_colors=y_color,
                             center=0.0, vmin=-4.8, vmax=4.8)
# Move the variable labels of the heatmap to the right-hand side and increase their font size
clustergrid.ax_heatmap.yaxis.tick_right()
clustergrid.ax_heatmap.yaxis.set_tick_params(labelsize=14)  # adjust the font size as desired
# Set the title
# intervals=[-0.1,5,10,16,21,217] # ---> ] .. , ..]
title = "Class = revenu_cdr \n I1:]-0.1->5] -> red , I2:]5->10] -> yellow , I3:]10->16] -> green , I4]16-> 21] -> blue , I5]21-> 217] -> violet"
title_obj = clustergrid.fig.suptitle(title, fontsize=40, y=0.43,x=0.55, fontweight='bold')
# Add a border to the title
title_obj.set_bbox({"facecolor": "white", "edgecolor": "black", "linewidth": 2})
clustergrid.savefig('CAH.png')
#clustergrid.fig.suptitle(title, fontsize=40, y=0.45,x=0.65,color="#17B79C",fontstyle="oblique")
# Interval 1 -> red , Interval 2 -> yellow , Interval 3 -> green , Interval 4 -> blue , Interval 5 -> violet
#-------------------------------------------------------------------------------------------------------------------------------------
#----------------------------------------------------------------[ Evaluation ]-------------------------------------------------------
#-------------------------------------------------------------------------------------------------------------------------------------
from sklearn.metrics import calinski_harabasz_score
kmeans_labels = KmeansData['Cluster']
# Cut the CAH dendrogram at distance 999. Note that Z was built on the unscaled sample, so this
# threshold is on the raw-data scale, while the index below is computed on the standardized data.
cah_cluster_labels = fcluster(Z, 999, criterion='distance')
calinski_kmeans = calinski_harabasz_score(X_scaled, kmeans_labels)
calinski_cah = calinski_harabasz_score(X_scaled, cah_cluster_labels)
print("Calinski-Harabasz Index - K-Means:", calinski_kmeans)
print("Calinski-Harabasz Index - CAH:", calinski_cah)
Calinski-Harabasz Index - K-Means: 130.28429037890933
Calinski-Harabasz Index - CAH: 2.0652054805288764
# "Calinski-Harabasz Index - K-Means: 130.284" means that, when using the K-Means clustering algorithm, the index value is 2.065. This value indicates the quality of the clusters obtained with K-Means. A higher value suggests that the data points are well-separated into distinct clusters.
#
# "Calinski-Harabasz Index - CAH: 861.5468568011702" means that, when using the Complete Agglomerative Hierarchical (CAH) clustering algorithm, the index value is 861.5468568011702. This value indicates the quality of the clusters obtained with CAH. A higher value suggests that the clusters formed through hierarchical clustering are well-separated and well-defined.
#
# In both cases, a higher Calinski-Harabasz Index is generally desirable because it indicates a better separation of clusters. However, the specific interpretation of the values can vary depending on the dataset and the problem you are trying to solve. It's often used as a relative measure, comparing different clustering solutions to choose the one with the highest index.
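# Optional sketch: for a like-for-like comparison, rebuild the linkage on the standardized data
# and cut it into the same number of clusters as K-Means (6), then recompute both criteria.
Z_scaled = linkage(X_scaled, method='ward', metric='euclidean')
cah_labels_scaled = fcluster(Z_scaled, t=6, criterion='maxclust')
print("Calinski-Harabasz - CAH on scaled data:", calinski_harabasz_score(X_scaled, cah_labels_scaled))
print("Silhouette - CAH on scaled data:", silhouette_score(X_scaled, cah_labels_scaled))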