In [1]:
#-----------------------------------------------------------------------------------------------------------------------------
#-------------------------------------------------[ Library Imports ]--------------------------------------------------------
#-----------------------------------------------------------------------------------------------------------------------------
In [2]:
import pandas as pd
from sklearn import preprocessing
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
In [3]:
#-----------------------------------------------------------------------------------------------------------------------------
#----------------------------------------------------[ Import Data ]----------------------------------------------------------
#-----------------------------------------------------------------------------------------------------------------------------
In [4]:
dataTelecom=pd.read_excel('S:\\TunisieTelecom\\TelecomData.xlsx')
dataTelecom.head(100)
Out[4]:
STATUT OFFRE ANC_M HANDSET revenu_voix revenu_inter NB_JOUR_ACTIVITE_TAXE NB_JOUR_APPEL_TAXE DUREE_APPEL_TOT DUREE_APPEL_TAXEE ... FREQ_USSD_VOIX FREQ_USSD_SMS VOLUME_SESSION VOLUME_SESSION_WEEKEND REVENU_VAS ARPU P_revenu_data P_revenu_voix_c P_revenu_vas_c id_client
0 Active Offre30 123 2G 12.709375 1.464447 10.366042 10.658842 27.261836 2.125971 ... 28.679570 146.930634 200000.000000 54.025577 1.059822 4.325398 0.136685 0.943095 0.279541 1.0
1 Active Offre8 98 2G 3.000000 0.756078 1.330736 7.079320 19.883099 7.735475 ... 28.490597 169.219363 3337.992419 24.789260 1.411803 2.677563 0.533695 0.540374 0.712010 2.0
2 Active Offre24 90 4G 32.514156 0.681197 0.158160 4.367702 133.476368 7.681088 ... 190.000000 44.403308 107082.775926 160.031496 18.195224 1.954007 0.799606 0.071368 0.376754 3.0
3 Active Offre10 226 2G 3.821551 5.265345 4.003452 3.086766 35.053364 8.540951 ... 19.412661 190.000000 128700.752169 102.668979 17.746873 2.496790 0.426526 0.314082 0.676112 4.0
4 Active Offre1 139 2G 60.009385 1.957144 5.689241 0.403300 0.844956 12.355853 ... 81.567146 6.365788 75654.384291 0.341570 7.280861 2.291381 0.319736 0.600057 0.719450 5.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
95 Active Offre8 159 2G 3.000000 2.250293 21.961465 2.872333 1.826910 27.907662 ... 12.954468 23.226754 32664.259485 21.276030 6.134130 0.689345 0.857627 0.392768 0.657034 96.0
96 Suspended Offre2 145 2G 3.000000 3.473084 3.575555 0.298674 80.858548 5.349497 ... 153.474792 190.000000 82909.748831 26.213769 1.504337 0.164892 0.980014 0.467268 0.150409 97.0
97 Active Offre10 185 2G 4.531545 2.142349 3.470616 6.400014 107.330103 5.199869 ... 157.487581 36.176280 141996.977868 42.321968 1.637815 1.848815 0.772398 0.407095 0.545033 98.0
98 Active Offre15 197 3G 11.762926 3.474145 2.842419 10.629819 86.011908 0.124836 ... 159.332900 77.858228 12052.670191 46.685004 1.422138 1.944146 0.152620 0.243845 0.387906 99.0
99 Suspended Offre30 178 2G 17.606096 0.019640 1.850818 0.184009 89.864928 5.858613 ... 18.339092 47.120168 134081.459601 10.649038 0.640634 2.204618 0.893336 0.293854 0.121368 100.0

100 rows × 64 columns
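
In [ ]:
# A minimal sanity-check sketch before selecting variables: confirm the shape,
# the column dtypes, and whether any values are missing.
print(dataTelecom.shape)
print(dataTelecom.dtypes.value_counts())
print(dataTelecom.isna().sum().sum(), 'missing values in total')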

In [5]:
# Select the most relevant variables for clustering
selected_variables = [
    'FREQ_USSD', 'revenu_cdr_c','DUREE_APPEL_TOT','MNT_RECH','VOLUME_SESSION', 'MNT_FORFAIT_DATA','FREQ_ACT_OUT'
]

# Select the desired variables from the dataframe
dataTelecomSelected = dataTelecom[selected_variables].copy()
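In [ ]:
# A minimal sketch motivated by the selection above: inspect pairwise
# correlations among the selected variables; highly correlated pairs are
# candidates for dropping one of the two.
corr = dataTelecomSelected.corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', center=0.0)
plt.title('Correlation matrix of selected variables')
plt.show()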
In [6]:
dataTelecomSelectedSample = dataTelecomSelected.sample(n=1000, random_state=42)  # fix the seed so the 1000-row sample is reproducible
KmeansData = dataTelecomSelectedSample.copy()
In [7]:
dataTelecomSelectedSample.head(1000)
Out[7]:
FREQ_USSD revenu_cdr_c DUREE_APPEL_TOT MNT_RECH VOLUME_SESSION MNT_FORFAIT_DATA FREQ_ACT_OUT
8272 117.715662 3.609806 132.448822 9.867506 170666.351197 19.796220 4.163475
27269 51.607567 25.120910 96.453174 21.567470 150130.855066 7.514821 2.672553
67156 81.423246 15.146389 101.062875 5.264629 15861.113848 0.865813 1.686709
57271 41.336494 5.563921 6.133606 19.266939 38589.945279 0.245570 2.135797
31677 87.387539 8.839919 401.105486 1.699672 135452.180362 1.941672 12.948769
... ... ... ... ... ... ... ...
21195 3.678090 6.354561 34.260661 4.839633 42362.792746 5.064106 8.070536
41095 81.029405 28.014919 45.847492 9.183478 7338.678331 31.952191 15.306484
75243 93.898015 6.523911 8.333489 4.737723 117516.039349 7.677401 2.275896
26227 7.331037 19.156340 172.246763 0.787336 76963.666513 0.155585 2.170419
24233 102.938425 4.201764 335.964029 15.895931 200000.000000 0.186818 13.451801

1000 rows × 7 columns

In [8]:
#-----------------------------------------------------------------------------------------------------------------------------
#---------------------------------------------------[ Clustering ]------------------------------------------------------------
#-----------------------------------------------------------------------------------------------------------------------------
In [9]:
#-------------------------------------> Elbow Method
In [10]:
# Standardize the data to have zero mean and unit variance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(KmeansData)

# Determine the optimal number of clusters using the Elbow Method
inertia = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)  # n_init pinned explicitly; its default changed across sklearn versions
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Plot the Elbow Method to find the optimal number of clusters
plt.figure(figsize=(8, 6))
plt.plot(range(1, 11), inertia, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Within-cluster Sum of Squares (Inertia)')
plt.title('Elbow Method for Optimal k')
plt.grid(True)
plt.show()
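In [ ]:
# A minimal cross-check sketch: the elbow is often ambiguous, so also compute
# the average silhouette score for each k (silhouette needs at least 2 clusters).
sil_scores = []
for k in range(2, 11):
    labels = KMeans(n_clusters=k, n_init=10, random_state=42).fit_predict(X_scaled)
    sil_scores.append(silhouette_score(X_scaled, labels))

plt.figure(figsize=(8, 6))
plt.plot(range(2, 11), sil_scores, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Average Silhouette Score')
plt.title('Silhouette Analysis for Optimal k')
plt.grid(True)
plt.show()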
In [11]:
# Perform k-means clustering with k = 6, the value suggested by the elbow plot above
k = 6
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)

# Add the cluster labels to the original DataFrame
KmeansData['Cluster'] = cluster_labels

# Explore the cluster characteristics (e.g., mean values of each variable per cluster)
cluster_characteristics = KmeansData.groupby('Cluster')[selected_variables].mean()

# Print the cluster characteristics
print(cluster_characteristics)

# You can further analyze and interpret the clusters to gain insights into customer segments based on usage behavior.
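In [ ]:
# A minimal profiling sketch: cluster sizes (very small clusters often signal
# outliers rather than segments) and the standardized centroids, where each
# cell reads as "standard deviations from the overall mean".
print(KmeansData['Cluster'].value_counts().sort_index())

centroids_scaled = pd.DataFrame(kmeans.cluster_centers_, columns=selected_variables)
plt.figure(figsize=(10, 5))
sns.heatmap(centroids_scaled, annot=True, fmt='.2f', cmap='coolwarm', center=0.0)
plt.title('Standardized cluster centroids (k = 6)')
plt.show()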
In [12]:
# Silhouette scores range from -1 to 1; higher values indicate better-defined clusters.
# A value around 0.17 points to weak, largely overlapping structure.
silhouette_avg = silhouette_score(X_scaled, cluster_labels)
print("Average Silhouette Score:", silhouette_avg)
Average Silhouette Score: 0.1718211576611401
In [13]:
# ---> CAH
plt.figure(figsize=(20, 16))
# Generate the linkage matrix (note: computed on the raw, unscaled sample,
# so large-magnitude variables such as VOLUME_SESSION dominate the Euclidean distances)
Z = linkage(dataTelecomSelectedSample, method='ward', metric='euclidean')

plt.title("CAH")

dendrogram(Z,labels=dataTelecomSelectedSample.index,orientation='left',color_threshold=0)

plt.show()
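In [ ]:
# A minimal readability sketch: with 1000 leaves the full dendrogram is hard to
# read, so truncate it to the last 30 merges (leaf counts shown in parentheses).
plt.figure(figsize=(12, 6))
dendrogram(Z, truncate_mode='lastp', p=30, show_contracted=True)
plt.title('CAH (truncated to the last 30 merges)')
plt.ylabel('Ward distance')
plt.show()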
In [14]:
# ---> Centering and Scaling (standardization)
In [15]:
scaler = preprocessing.StandardScaler()
dataTelecomSelectedSample_scaled = scaler.fit_transform(dataTelecomSelectedSample)
dataTelecomSelectedSample_scaled = pd.DataFrame(dataTelecomSelectedSample_scaled, index=dataTelecomSelectedSample.index, columns=dataTelecomSelectedSample.columns)
print('X_scaled', dataTelecomSelectedSample_scaled.shape)
X_scaled (1000, 7)
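In [ ]:
# A minimal sanity check: after standardization every column should have a
# mean close to 0 and a standard deviation close to 1.
print(dataTelecomSelectedSample_scaled.mean().round(3))
print(dataTelecomSelectedSample_scaled.std().round(3))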
In [16]:
# Interval length for five equal-width bins (kept for reference; the bins below are hand-picked)
max_value = dataTelecomSelectedSample['revenu_cdr_c'].max()
interval_length = max_value / 5

# Define the intervals
intervals = [-0.1, 5, 10, 16, 21, 217]  # left-open, right-closed bins: (a, b]
#intervals = np.arange(-1, max_value + interval_length , interval_length)  # Adding a small value for inclusiveness

# Assign interval labels
labels = ['Interval 1', 'Interval 2', 'Interval 3', 'Interval 4', 'Interval 5']

# Transform column values into intervals
dataTelecomSelectedSample['Intervals'] = pd.cut(dataTelecomSelectedSample['revenu_cdr_c'], bins=intervals, labels=labels, right=True)

dataTelecomSelectedSample.head(1000)
Out[16]:
FREQ_USSD revenu_cdr_c DUREE_APPEL_TOT MNT_RECH VOLUME_SESSION MNT_FORFAIT_DATA FREQ_ACT_OUT Intervals
8272 117.715662 3.609806 132.448822 9.867506 170666.351197 19.796220 4.163475 Interval 1
27269 51.607567 25.120910 96.453174 21.567470 150130.855066 7.514821 2.672553 Interval 5
67156 81.423246 15.146389 101.062875 5.264629 15861.113848 0.865813 1.686709 Interval 3
57271 41.336494 5.563921 6.133606 19.266939 38589.945279 0.245570 2.135797 Interval 2
31677 87.387539 8.839919 401.105486 1.699672 135452.180362 1.941672 12.948769 Interval 2
... ... ... ... ... ... ... ... ...
21195 3.678090 6.354561 34.260661 4.839633 42362.792746 5.064106 8.070536 Interval 2
41095 81.029405 28.014919 45.847492 9.183478 7338.678331 31.952191 15.306484 Interval 5
75243 93.898015 6.523911 8.333489 4.737723 117516.039349 7.677401 2.275896 Interval 2
26227 7.331037 19.156340 172.246763 0.787336 76963.666513 0.155585 2.170419 Interval 4
24233 102.938425 4.201764 335.964029 15.895931 200000.000000 0.186818 13.451801 Interval 1

1000 rows × 8 columns
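
In [ ]:
# A minimal check sketch: how many customers fall in each hand-picked
# revenu_cdr_c bin; badly unbalanced bins would make the clustermap's
# column colors uninformative.
print(dataTelecomSelectedSample['Intervals'].value_counts().sort_index())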

In [18]:
y = dataTelecomSelectedSample['Intervals']
print('y', y.shape, y.unique())  # if variables are correlated, keep only one of each pair; prefer variables that are weakly correlated with each other
y (1000,) ['Interval 1', 'Interval 5', 'Interval 3', 'Interval 2', 'Interval 4']
Categories (5, object): ['Interval 1' < 'Interval 2' < 'Interval 3' < 'Interval 4' < 'Interval 5']
In [19]:
class_color = {'Interval 1': 'red', 'Interval 2': 'yellow', 'Interval 3': 'green', 'Interval 4': 'blue', 'Interval 5': 'violet'}
#class_color = {'Interval 1': '#F3FF83', 'Interval 2': '#A0D287', 'Interval 3': '#4BA48D','Interval 4':'#05728A','Interval 5':'#1D4060'}
y_color = [class_color[c] for c in y]
print('y', y.shape, y.unique())
y (1000,) ['Interval 1', 'Interval 5', 'Interval 3', 'Interval 2', 'Interval 4']
Categories (5, object): ['Interval 1' < 'Interval 2' < 'Interval 3' < 'Interval 4' < 'Interval 5']
In [21]:
metric = 'euclidean'
method = 'ward'
cmap = 'coolwarm'

clustergrid = sns.clustermap(dataTelecomSelectedSample_scaled.T, figsize=(32, 20), metric=metric, method=method, cmap=cmap,
                             row_cluster=True, col_cluster=True, col_colors=y_color,
                             center=0.0, vmin=-4.8, vmax=4.8)

# Increase the font size of the column labels on the right
clustergrid.ax_heatmap.yaxis.tick_right()
clustergrid.ax_heatmap.yaxis.set_tick_params(labelsize=14)  # Adjust the font size as desired

# Set the title
# intervals=[-0.1,5,10,16,21,217] # ---> ] .. , ..]
title = "Class = revenu_cdr  \n I1:]-0.1->5] -> red , I2:]5->10] -> yellow , I3:]10->16] -> green , I4]16-> 21] -> blue , I5]21-> 217] -> violet"
title_obj = clustergrid.fig.suptitle(title, fontsize=40, y=0.43,x=0.55, fontweight='bold')

# Add a border to the title
title_obj.set_bbox({"facecolor": "white", "edgecolor": "black", "linewidth": 2})
clustergrid.savefig('CAH.png')


#clustergrid.fig.suptitle(title, fontsize=40, y=0.45,x=0.65,color="#17B79C",fontstyle="oblique")


# Interval 1 -> red , Interval 2 -> yellow , Interval 3 -> green , Interval 4 -> blue , Interval 5 -> violet
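In [ ]:
# A minimal sketch: the col_colors strip has no built-in legend, so build one
# from the class_color mapping with matplotlib patches.
from matplotlib.patches import Patch

handles = [Patch(facecolor=color, edgecolor='black', label=interval)
           for interval, color in class_color.items()]
clustergrid.ax_heatmap.legend(handles=handles, title='revenu_cdr_c interval',
                              loc='upper left', bbox_to_anchor=(1.15, 1.0))
clustergrid.savefig('CAH_with_legend.png')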
In [23]:
#-------------------------------------------------------------------------------------------------------------------------------------
#----------------------------------------------------------------[ Evaluation ]-------------------------------------------------------
#-------------------------------------------------------------------------------------------------------------------------------------
In [25]:
from sklearn.metrics import calinski_harabasz_score

kmeans_labels = KmeansData['Cluster']

# Cut the CAH tree at Ward distance 999. Note that Z was built on the raw
# (unscaled) sample, while the scores below are computed on X_scaled; this
# mismatch puts the CAH labels at a disadvantage in the comparison.
cah_cluster_labels = fcluster(Z, 999, criterion='distance')

calinski_kmeans = calinski_harabasz_score(X_scaled, kmeans_labels)
calinski_cah = calinski_harabasz_score(X_scaled, cah_cluster_labels)


print("Calinski-Harabasz Index - K-Means:", calinski_kmeans)
print("Calinski-Harabasz Index - CAH:", calinski_cah)
Calinski-Harabasz Index - K-Means: 130.28429037890933
Calinski-Harabasz Index - CAH: 2.0652054805288764
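In [ ]:
# A minimal complementary sketch: the Davies-Bouldin index (lower is better)
# checks the same labels with a different notion of cluster compactness.
from sklearn.metrics import davies_bouldin_score

print("Davies-Bouldin Index - K-Means:", davies_bouldin_score(X_scaled, kmeans_labels))
print("Davies-Bouldin Index - CAH:", davies_bouldin_score(X_scaled, cah_cluster_labels))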
In [ ]:
# "Calinski-Harabasz Index - K-Means: 130.284" means that, when using the K-Means clustering algorithm, the index value is 2.065. This value indicates the quality of the clusters obtained with K-Means. A higher value suggests that the data points are well-separated into distinct clusters.
#
# "Calinski-Harabasz Index - CAH: 861.5468568011702" means that, when using the Complete Agglomerative Hierarchical (CAH) clustering algorithm, the index value is 861.5468568011702. This value indicates the quality of the clusters obtained with CAH. A higher value suggests that the clusters formed through hierarchical clustering are well-separated and well-defined.
#
# In both cases, a higher Calinski-Harabasz Index is generally desirable because it indicates a better separation of clusters. However, the specific interpretation of the values can vary depending on the dataset and the problem you are trying to solve. It's often used as a relative measure, comparing different clustering solutions to choose the one with the highest index.