#-----------------------------------------------------------------------------------------------------------------------------
#------------------------------------------------[ Libraries Imports ]--------------------------------------------------------
#-----------------------------------------------------------------------------------------------------------------------------
import numpy as np  # needed only if the equal-width binning alternative further below is enabled
import pandas as pd
from sklearn import preprocessing
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
#-----------------------------------------------------------------------------------------------------------------------------
#---------------------------------------------------[ Import DataBase ]-------------------------------------------------------
#-----------------------------------------------------------------------------------------------------------------------------
dataTelecom = pd.read_excel('S:\\TunisieTelecom\\TelecomData.xlsx')
dataTelecom.head(100)
 | STATUT | OFFRE | ANC_M | HANDSET | revenu_voix | revenu_inter | NB_JOUR_ACTIVITE_TAXE | NB_JOUR_APPEL_TAXE | DUREE_APPEL_TOT | DUREE_APPEL_TAXEE | ... | FREQ_USSD_VOIX | FREQ_USSD_SMS | VOLUME_SESSION | VOLUME_SESSION_WEEKEND | REVENU_VAS | ARPU | P_revenu_data | P_revenu_voix_c | P_revenu_vas_c | id_client |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Active | Offre30 | 123 | 2G | 12.709375 | 1.464447 | 10.366042 | 10.658842 | 27.261836 | 2.125971 | ... | 28.679570 | 146.930634 | 200000.000000 | 54.025577 | 1.059822 | 4.325398 | 0.136685 | 0.943095 | 0.279541 | 1.0 |
1 | Active | Offre8 | 98 | 2G | 3.000000 | 0.756078 | 1.330736 | 7.079320 | 19.883099 | 7.735475 | ... | 28.490597 | 169.219363 | 3337.992419 | 24.789260 | 1.411803 | 2.677563 | 0.533695 | 0.540374 | 0.712010 | 2.0 |
2 | Active | Offre24 | 90 | 4G | 32.514156 | 0.681197 | 0.158160 | 4.367702 | 133.476368 | 7.681088 | ... | 190.000000 | 44.403308 | 107082.775926 | 160.031496 | 18.195224 | 1.954007 | 0.799606 | 0.071368 | 0.376754 | 3.0 |
3 | Active | Offre10 | 226 | 2G | 3.821551 | 5.265345 | 4.003452 | 3.086766 | 35.053364 | 8.540951 | ... | 19.412661 | 190.000000 | 128700.752169 | 102.668979 | 17.746873 | 2.496790 | 0.426526 | 0.314082 | 0.676112 | 4.0 |
4 | Active | Offre1 | 139 | 2G | 60.009385 | 1.957144 | 5.689241 | 0.403300 | 0.844956 | 12.355853 | ... | 81.567146 | 6.365788 | 75654.384291 | 0.341570 | 7.280861 | 2.291381 | 0.319736 | 0.600057 | 0.719450 | 5.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
95 | Active | Offre8 | 159 | 2G | 3.000000 | 2.250293 | 21.961465 | 2.872333 | 1.826910 | 27.907662 | ... | 12.954468 | 23.226754 | 32664.259485 | 21.276030 | 6.134130 | 0.689345 | 0.857627 | 0.392768 | 0.657034 | 96.0 |
96 | Suspended | Offre2 | 145 | 2G | 3.000000 | 3.473084 | 3.575555 | 0.298674 | 80.858548 | 5.349497 | ... | 153.474792 | 190.000000 | 82909.748831 | 26.213769 | 1.504337 | 0.164892 | 0.980014 | 0.467268 | 0.150409 | 97.0 |
97 | Active | Offre10 | 185 | 2G | 4.531545 | 2.142349 | 3.470616 | 6.400014 | 107.330103 | 5.199869 | ... | 157.487581 | 36.176280 | 141996.977868 | 42.321968 | 1.637815 | 1.848815 | 0.772398 | 0.407095 | 0.545033 | 98.0 |
98 | Active | Offre15 | 197 | 3G | 11.762926 | 3.474145 | 2.842419 | 10.629819 | 86.011908 | 0.124836 | ... | 159.332900 | 77.858228 | 12052.670191 | 46.685004 | 1.422138 | 1.944146 | 0.152620 | 0.243845 | 0.387906 | 99.0 |
99 | Suspended | Offre30 | 178 | 2G | 17.606096 | 0.019640 | 1.850818 | 0.184009 | 89.864928 | 5.858613 | ... | 18.339092 | 47.120168 | 134081.459601 | 10.649038 | 0.640634 | 2.204618 | 0.893336 | 0.293854 | 0.121368 | 100.0 |
100 rows × 64 columns
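# Optional sketch: quick sanity checks on the raw frame before selecting variables
# (dimensions, column types, worst missing-value columns); assumes dataTelecom loaded as above.
print(dataTelecom.shape)
print(dataTelecom.dtypes.value_counts())
print(dataTelecom.isna().sum().sort_values(ascending=False).head(10))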
# Select the most relevant variables for the clustering
selected_variables = [
    'FREQ_USSD', 'revenu_cdr_c', 'DUREE_APPEL_TOT', 'MNT_RECH',
    'VOLUME_SESSION', 'MNT_FORFAIT_DATA', 'FREQ_ACT_OUT'
]
# Keep only the selected variables from the dataframe
dataTelecomSelected = dataTelecom[selected_variables].copy()
# Draw a random sample of 1000 customers (a fixed random_state could be passed to make the sample reproducible)
dataTelecomSelectedSample = dataTelecomSelected.sample(n=1000)
KmeansData = dataTelecomSelectedSample.copy()
dataTelecomSelectedSample.head(1000)
 | FREQ_USSD | revenu_cdr_c | DUREE_APPEL_TOT | MNT_RECH | VOLUME_SESSION | MNT_FORFAIT_DATA | FREQ_ACT_OUT |
---|---|---|---|---|---|---|---|
8272 | 117.715662 | 3.609806 | 132.448822 | 9.867506 | 170666.351197 | 19.796220 | 4.163475 |
27269 | 51.607567 | 25.120910 | 96.453174 | 21.567470 | 150130.855066 | 7.514821 | 2.672553 |
67156 | 81.423246 | 15.146389 | 101.062875 | 5.264629 | 15861.113848 | 0.865813 | 1.686709 |
57271 | 41.336494 | 5.563921 | 6.133606 | 19.266939 | 38589.945279 | 0.245570 | 2.135797 |
31677 | 87.387539 | 8.839919 | 401.105486 | 1.699672 | 135452.180362 | 1.941672 | 12.948769 |
... | ... | ... | ... | ... | ... | ... | ... |
21195 | 3.678090 | 6.354561 | 34.260661 | 4.839633 | 42362.792746 | 5.064106 | 8.070536 |
41095 | 81.029405 | 28.014919 | 45.847492 | 9.183478 | 7338.678331 | 31.952191 | 15.306484 |
75243 | 93.898015 | 6.523911 | 8.333489 | 4.737723 | 117516.039349 | 7.677401 | 2.275896 |
26227 | 7.331037 | 19.156340 | 172.246763 | 0.787336 | 76963.666513 | 0.155585 | 2.170419 |
24233 | 102.938425 | 4.201764 | 335.964029 | 15.895931 | 200000.000000 | 0.186818 | 13.451801 |
1000 rows × 7 columns
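# Optional sketch: inspect pairwise correlations of the selected variables — distance-based
# clustering is easier to interpret when the retained variables are not strongly redundant.
# Assumes the 1000-row sample built above.
corr = dataTelecomSelectedSample.corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.title('Correlation between selected variables')
plt.show()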
#-----------------------------------------------------------------------------------------------------------------------------
#---------------------------------------------------[ Clustering ]------------------------------------------------------------
#-----------------------------------------------------------------------------------------------------------------------------
#-------------------------------------> Elbow Method
# Standardize the data to have zero mean and unit variance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(KmeansData)
# Determine the optimal number of clusters using the Elbow Method
inertia = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)  # n_init set explicitly for stable results across scikit-learn versions
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)
# Plot the Elbow Method to find the optimal number of clusters
plt.figure(figsize=(8, 6))
plt.plot(range(1, 11), inertia, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Within-cluster Sum of Squares (Inertia)')
plt.title('Elbow Method for Optimal k')
plt.grid(True)
plt.show()
# The elbow plot suggests k = 6
# Perform k-means clustering with the selected number of clusters
k = 6
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)
# Add the cluster labels to the original DataFrame
KmeansData['Cluster'] = cluster_labels
# Explore the cluster characteristics (e.g., mean values of each variable per cluster)
cluster_characteristics = KmeansData.groupby('Cluster')[selected_variables].mean()
# Print the cluster characteristics
print(cluster_characteristics)
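# Optional sketch: check the size of each cluster — a very small cluster often corresponds to
# outliers rather than a genuine customer segment. Assumes KmeansData labeled above.
print(KmeansData['Cluster'].value_counts().sort_index())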
# You can further analyze and interpret the clusters to gain insights into customer segments based on usage behavior.
# Higher silhouette scores indicate better-defined clusters.
silhouette_avg = silhouette_score(X_scaled, cluster_labels)
print("Average Silhouette Score:", silhouette_avg)
Average Silhouette Score: 0.1718211576611401
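# Optional sketch: cross-check the elbow choice with the average silhouette over a range of k;
# higher values indicate better-separated clusters. Assumes X_scaled from above.
for k_try in range(2, 11):
    labels_k = KMeans(n_clusters=k_try, n_init=10, random_state=42).fit_predict(X_scaled)
    print(f"k={k_try}: silhouette={silhouette_score(X_scaled, labels_k):.4f}")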
# ---> CAH (agglomerative hierarchical clustering)
plt.figure(figsize=(20, 16))
# Generate the linkage matrix.
# Note: the linkage here is computed on the raw (unscaled) sample, so large-scale variables
# such as VOLUME_SESSION dominate the Euclidean distances; the standardized version follows below.
Z = linkage(dataTelecomSelectedSample, method='ward', metric='euclidean')
plt.title("CAH")
dendrogram(Z, labels=dataTelecomSelectedSample.index, orientation='left', color_threshold=0)
plt.show()
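# Optional sketch: instead of a distance threshold, cut the dendrogram into an exact number of
# clusters with criterion='maxclust'. Assumes the linkage matrix Z built above.
cah_labels_6 = fcluster(Z, t=6, criterion='maxclust')
print(pd.Series(cah_labels_6).value_counts().sort_index())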
# ---> Centering and scaling (standardization)
scaler = preprocessing.StandardScaler()
dataTelecomSelectedSample_scaled = scaler.fit_transform(dataTelecomSelectedSample)
dataTelecomSelectedSample_scaled = pd.DataFrame(dataTelecomSelectedSample_scaled, index=dataTelecomSelectedSample.index, columns=dataTelecomSelectedSample.columns)
print('dataTelecomSelectedSample_scaled', dataTelecomSelectedSample_scaled.shape)
dataTelecomSelectedSample_scaled (1000, 7)
# Compute the equal-width interval length (only used by the commented-out alternative below)
max_value = dataTelecomSelectedSample['revenu_cdr_c'].max()
interval_length = max_value / 5
# Define the bin edges; each bin is left-open / right-closed: ]lower, upper]
intervals = [-0.1, 5, 10, 16, 21, 217]
# intervals = np.arange(-1, max_value + interval_length, interval_length)  # equal-width alternative; starts slightly below 0 so the lowest values are included
# Assign interval labels
labels = ['Interval 1', 'Interval 2', 'Interval 3', 'Interval 4', 'Interval 5']
# Transform column values into intervals
dataTelecomSelectedSample['Intervals'] = pd.cut(dataTelecomSelectedSample['revenu_cdr_c'], bins=intervals, labels=labels, right=True)
dataTelecomSelectedSample.head(1000)
 | FREQ_USSD | revenu_cdr_c | DUREE_APPEL_TOT | MNT_RECH | VOLUME_SESSION | MNT_FORFAIT_DATA | FREQ_ACT_OUT | Intervals |
---|---|---|---|---|---|---|---|---|
8272 | 117.715662 | 3.609806 | 132.448822 | 9.867506 | 170666.351197 | 19.796220 | 4.163475 | Interval 1 |
27269 | 51.607567 | 25.120910 | 96.453174 | 21.567470 | 150130.855066 | 7.514821 | 2.672553 | Interval 5 |
67156 | 81.423246 | 15.146389 | 101.062875 | 5.264629 | 15861.113848 | 0.865813 | 1.686709 | Interval 3 |
57271 | 41.336494 | 5.563921 | 6.133606 | 19.266939 | 38589.945279 | 0.245570 | 2.135797 | Interval 2 |
31677 | 87.387539 | 8.839919 | 401.105486 | 1.699672 | 135452.180362 | 1.941672 | 12.948769 | Interval 2 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
21195 | 3.678090 | 6.354561 | 34.260661 | 4.839633 | 42362.792746 | 5.064106 | 8.070536 | Interval 2 |
41095 | 81.029405 | 28.014919 | 45.847492 | 9.183478 | 7338.678331 | 31.952191 | 15.306484 | Interval 5 |
75243 | 93.898015 | 6.523911 | 8.333489 | 4.737723 | 117516.039349 | 7.677401 | 2.275896 | Interval 2 |
26227 | 7.331037 | 19.156340 | 172.246763 | 0.787336 | 76963.666513 | 0.155585 | 2.170419 | Interval 4 |
24233 | 102.938425 | 4.201764 | 335.964029 | 15.895931 | 200000.000000 | 0.186818 | 13.451801 | Interval 1 |
1000 rows × 8 columns
y = dataTelecomSelectedSample['Intervals']
print('y', y.shape, y.unique())  # if variables are strongly correlated, keep only one of them: choose variables whose mutual correlation is low
y (1000,) ['Interval 1', 'Interval 5', 'Interval 3', 'Interval 2', 'Interval 4'] Categories (5, object): ['Interval 1' < 'Interval 2' < 'Interval 3' < 'Interval 4' < 'Interval 5']
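# Optional sketch: check how the sample spreads across the intervals (dropna=False would also
# surface any value that fell outside the defined bin edges). Assumes y built above.
print(y.value_counts(dropna=False).sort_index())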
class_color = {'Interval 1': 'red', 'Interval 2': 'yellow', 'Interval 3': 'green', 'Interval 4': 'blue', 'Interval 5': 'violet'}
#class_color = {'Interval 1': '#F3FF83', 'Interval 2': '#A0D287', 'Interval 3': '#4BA48D','Interval 4':'#05728A','Interval 5':'#1D4060'}
y_color = [class_color[c] for c in y]
metric = 'euclidean'
method = 'ward'
cmap = 'coolwarm'
clustergrid = sns.clustermap(dataTelecomSelectedSample_scaled.T, figsize=(32, 20), metric=metric, method=method, cmap=cmap,
                             row_cluster=True, col_cluster=True, col_colors=y_color,
                             center=0.0, vmin=-4.8, vmax=4.8)
# Move the variable labels of the heatmap to the right-hand side and increase their font size
clustergrid.ax_heatmap.yaxis.tick_right()
clustergrid.ax_heatmap.yaxis.set_tick_params(labelsize=14)  # adjust the font size as desired
# Set the title
# intervals=[-0.1,5,10,16,21,217] # ---> ] .. , ..]
title = "Class = revenu_cdr \n I1:]-0.1->5] -> red , I2:]5->10] -> yellow , I3:]10->16] -> green , I4]16-> 21] -> blue , I5]21-> 217] -> violet"
title_obj = clustergrid.fig.suptitle(title, fontsize=40, y=0.43,x=0.55, fontweight='bold')
# Add a border to the title
title_obj.set_bbox({"facecolor": "white", "edgecolor": "black", "linewidth": 2})
clustergrid.savefig('CAH.png')
#clustergrid.fig.suptitle(title, fontsize=40, y=0.45,x=0.65,color="#17B79C",fontstyle="oblique")
# Interval 1 -> red , Interval 2 -> yellow , Interval 3 -> green , Interval 4 -> blue , Interval 5 -> violet
#-------------------------------------------------------------------------------------------------------------------------------------
#----------------------------------------------------------------[ Evaluation ]-------------------------------------------------------
#-------------------------------------------------------------------------------------------------------------------------------------
from sklearn.metrics import calinski_harabasz_score
kmeans_labels = KmeansData['Cluster']
# Cut the CAH dendrogram at distance 999. Note that Z was built on the unscaled sample, so this
# threshold is on the raw-data scale, while the index below is computed on the standardized data.
cah_cluster_labels = fcluster(Z, 999, criterion='distance')
calinski_kmeans = calinski_harabasz_score(X_scaled, kmeans_labels)
calinski_cah = calinski_harabasz_score(X_scaled, cah_cluster_labels)
print("Calinski-Harabasz Index - K-Means:", calinski_kmeans)
print("Calinski-Harabasz Index - CAH:", calinski_cah)
Calinski-Harabasz Index - K-Means: 130.28429037890933
Calinski-Harabasz Index - CAH: 2.0652054805288764
# "Calinski-Harabasz Index - K-Means: 130.284" means that, when using the K-Means clustering algorithm, the index value is 2.065. This value indicates the quality of the clusters obtained with K-Means. A higher value suggests that the data points are well-separated into distinct clusters.
#
# "Calinski-Harabasz Index - CAH: 861.5468568011702" means that, when using the Complete Agglomerative Hierarchical (CAH) clustering algorithm, the index value is 861.5468568011702. This value indicates the quality of the clusters obtained with CAH. A higher value suggests that the clusters formed through hierarchical clustering are well-separated and well-defined.
#
# In both cases, a higher Calinski-Harabasz Index is generally desirable because it indicates a better separation of clusters. However, the specific interpretation of the values can vary depending on the dataset and the problem you are trying to solve. It's often used as a relative measure, comparing different clustering solutions to choose the one with the highest index.
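# Optional sketch: for a like-for-like comparison, rebuild the linkage on the standardized data
# and cut it into the same number of clusters as K-Means (6), then recompute both criteria.
Z_scaled = linkage(X_scaled, method='ward', metric='euclidean')
cah_labels_scaled = fcluster(Z_scaled, t=6, criterion='maxclust')
print("Calinski-Harabasz - CAH on scaled data:", calinski_harabasz_score(X_scaled, cah_labels_scaled))
print("Silhouette - CAH on scaled data:", silhouette_score(X_scaled, cah_labels_scaled))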