In [1]:
#-----------------------------------------------------------------------------------------------------------------------------
#----------------------------------------------[ Needed Libraries Import ]----------------------------------------------------
#-----------------------------------------------------------------------------------------------------------------------------
In [2]:
import easygui
import sweetviz as sv
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from ydata_profiling.controller.pandas_decorator import profile_report
import plotly.graph_objects as go
In [3]:
#-----------------------------------------------------------------------------------------------------------------------------
#---------------------------------------------------[ Import DataBase ]-------------------------------------------------------
#-----------------------------------------------------------------------------------------------------------------------------
In [4]:
dataTelecom=pd.read_excel('S:\\TunisieTelecom\\TelecomData.xlsx')
dataTelecom.head(10)
Out[4]:
STATUT OFFRE ANC_M HANDSET revenu_voix revenu_inter NB_JOUR_ACTIVITE_TAXE NB_JOUR_APPEL_TAXE DUREE_APPEL_TOT DUREE_APPEL_TAXEE ... FREQ_USSD_VOIX FREQ_USSD_SMS VOLUME_SESSION VOLUME_SESSION_WEEKEND REVENU_VAS ARPU P_revenu_data P_revenu_voix_c P_revenu_vas_c id_client
0 Active Offre30 123 2G 12.709375 1.464447 10.366042 10.658842 27.261836 2.125971 ... 28.679570 146.930634 200000.000000 54.025577 1.059822 4.325398 0.136685 0.943095 0.279541 1.0
1 Active Offre8 98 2G 3.000000 0.756078 1.330736 7.079320 19.883099 7.735475 ... 28.490597 169.219363 3337.992419 24.789260 1.411803 2.677563 0.533695 0.540374 0.712010 2.0
2 Active Offre24 90 4G 32.514156 0.681197 0.158160 4.367702 133.476368 7.681088 ... 190.000000 44.403308 107082.775926 160.031496 18.195224 1.954007 0.799606 0.071368 0.376754 3.0
3 Active Offre10 226 2G 3.821551 5.265345 4.003452 3.086766 35.053364 8.540951 ... 19.412661 190.000000 128700.752169 102.668979 17.746873 2.496790 0.426526 0.314082 0.676112 4.0
4 Active Offre1 139 2G 60.009385 1.957144 5.689241 0.403300 0.844956 12.355853 ... 81.567146 6.365788 75654.384291 0.341570 7.280861 2.291381 0.319736 0.600057 0.719450 5.0
5 Suspended Offre16 118 2G 3.000000 1.636158 9.406248 3.346885 26.779409 14.407004 ... 34.922712 40.620738 65253.482895 33.274496 10.439648 6.212364 0.274582 0.419963 0.280881 6.0
6 Active Offre10 154 2G 31.953274 1.656711 3.993698 5.357250 2.113332 20.677427 ... 34.152966 95.329383 45157.810902 41.821374 0.249189 1.432389 0.566405 0.152414 0.130876 7.0
7 Suspended Offre6 96 2G 10.331387 2.498662 16.020874 0.100707 5.922602 7.850522 ... 65.508907 75.616780 37165.463625 65.573652 1.780176 3.982989 0.620034 0.369095 0.722737 8.0
8 Active Offre23 112 2G 14.262607 8.888795 0.773295 2.281113 68.691901 0.827132 ... 190.000000 28.773361 200000.000000 35.931179 0.290472 1.772127 0.087045 0.512316 0.922651 9.0
9 Active Offre8 137 2G 5.870684 0.824999 3.084907 11.425205 37.872839 19.419044 ... 50.228614 89.822382 65233.389597 36.545510 0.035157 0.874001 0.708388 0.766209 0.142649 10.0

10 rows × 64 columns

In [5]:
dataTelecomSample = dataTelecom.sample(n=1000)
In [6]:
#-----------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------[ Variables Profiling ]----------------------------------------------------
#-----------------------------------------------------------------------------------------------------------------------------
In [7]:
dataTelecom.describe()
Out[7]:
ANC_M revenu_voix revenu_inter NB_JOUR_ACTIVITE_TAXE NB_JOUR_APPEL_TAXE DUREE_APPEL_TOT DUREE_APPEL_TAXEE DUREE_APPEL_GRATUIT nb_APPEL_GRATUIT nb_APPEL_TAXE ... FREQ_USSD_VOIX FREQ_USSD_SMS VOLUME_SESSION VOLUME_SESSION_WEEKEND REVENU_VAS ARPU P_revenu_data P_revenu_voix_c P_revenu_vas_c id_client
count 92143.000000 92143.000000 92143.000000 92143.000000 92143.000000 92143.000000 92143.000000 92143.000000 92143.000000 92143.000000 ... 92143.000000 92143.000000 92143.000000 92143.000000 92143.000000 92143.000000 92143.000000 92143.000000 9.214300e+04 92142.000000
mean 149.171364 15.239636 1.498722 7.809857 6.874736 99.955509 15.062154 15.062154 30.006924 14.912332 ... 79.053113 79.064533 86696.838371 100.244944 5.011416 4.979407 0.501161 0.499919 5.008529e-01 46071.500000
std 57.765160 14.682741 1.497972 7.283316 6.526537 99.949019 15.022923 15.022923 29.974072 14.947696 ... 61.947681 61.913723 66338.180671 99.833913 5.026841 4.997452 0.288310 0.288800 2.891844e-01 26599.248589
min 50.000000 3.000000 0.000001 0.000112 0.000068 0.000018 0.000240 0.000240 0.000261 0.000060 ... 0.003447 0.000055 2.689071 0.000994 0.000015 0.000009 0.000001 0.000015 9.336428e-07 1.000000
25% 99.000000 4.276194 0.431270 2.283187 2.010586 28.798354 4.329003 4.329003 8.679738 4.281366 ... 25.926583 25.960899 29194.406876 29.182456 1.427911 1.429946 0.251467 0.249248 2.503687e-01 23036.250000
50% 149.000000 10.358993 1.042991 5.493399 4.870998 69.419866 10.499201 10.499201 20.703940 10.333792 ... 62.154732 62.309948 69415.161210 69.773265 3.450904 3.443868 0.502028 0.498710 5.013815e-01 46071.500000
75% 199.000000 20.725777 2.078414 11.126519 9.664099 137.938430 20.898580 20.898580 41.583846 20.680499 ... 124.685338 124.695080 138643.945475 138.925907 6.956149 6.895372 0.750763 0.749972 7.525205e-01 69106.750000
max 249.000000 168.860618 19.976912 30.000000 30.000000 1155.277214 164.188009 164.188009 368.313521 171.425214 ... 190.000000 190.000000 200000.000000 1222.280603 60.850935 51.063309 0.999988 1.000000 9.999926e-01 92142.000000

8 rows × 61 columns

In [8]:
# Assuming your dataframe is called 'df'
selected_variables = [
'DUREE_APPEL_TOT',
'nb_sms_tot' ,
'revenu_cdr_c',
'MNT_RECH',
'FREQ_ACT_OUT',
'NB_JOUR_APPEL_TAXE' ,
'NB_SMS_TAXE' ,
'NB_RECH_SUP5',
'MNT_TRANSFERT_OUT' ,
'revenu_voix' ,
'MNT_FORFAIT_DATA',
'NB_FORFAIT_VOIX' ,
'OFFRE',
'VOLUME_SESSION',
'P_revenu_data',
'MNT_FORFAIT',
'FREQ_USSD',
'P_FF_Data' ,
'Duree_onnet_tot',
'Duree_offnet_tot' ,
]
# Select the desired variables from the dataframe
dataTelecomSelected = dataTelecom[selected_variables].copy()
dataTelecomSelected.head(100)
Out[8]:
DUREE_APPEL_TOT nb_sms_tot revenu_cdr_c MNT_RECH FREQ_ACT_OUT NB_JOUR_APPEL_TAXE NB_SMS_TAXE NB_RECH_SUP5 MNT_TRANSFERT_OUT revenu_voix MNT_FORFAIT_DATA NB_FORFAIT_VOIX OFFRE VOLUME_SESSION P_revenu_data MNT_FORFAIT FREQ_USSD P_FF_Data Duree_onnet_tot Duree_offnet_tot
0 27.261836 3.685763 19.941537 12.306756 10.667904 10.658842 1.624059 2.00 8.877510 12.709375 2.059265 8.589441 Offre30 200000.000000 0.136685 10.882398 190.000000 0.498091 30.583118 27.219712
1 19.883099 1.476121 3.908449 10.348607 13.650949 7.079320 7.059615 0.67 1.633333 3.000000 20.025208 5.685491 Offre8 3337.992419 0.533695 1.049276 18.623378 0.096339 13.426765 32.571522
2 133.476368 6.162948 32.541849 3.939266 7.079317 4.367702 1.638128 0.00 0.011593 32.514156 5.397060 4.417241 Offre24 107082.775926 0.799606 4.041068 66.449461 0.667540 6.217262 24.861880
3 35.053364 1.448782 4.145009 2.408488 17.423588 3.086766 0.016726 0.33 2.003034 3.821551 0.534564 0.107662 Offre10 128700.752169 0.426526 6.441283 19.259807 0.407815 5.837191 37.779013
4 0.844956 1.131308 60.496821 10.870592 0.067246 0.403300 3.934964 0.33 2.755649 60.009385 0.031219 2.276588 Offre1 75654.384291 0.319736 0.805758 38.034894 0.828712 10.395156 2.831625
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
95 1.826910 3.198013 5.638954 5.605779 16.488487 2.872333 0.420523 0.00 2.242692 3.000000 7.558205 8.874387 Offre8 32664.259485 0.857627 2.735164 41.723923 0.075043 2.354877 6.515707
96 80.858548 11.867689 8.145327 9.174032 13.042782 0.298674 0.329446 0.33 11.978245 3.000000 31.046581 2.124691 Offre2 82909.748831 0.980014 6.002234 177.422094 0.971460 0.183394 49.746824
97 107.330103 12.026432 6.377507 16.425132 2.520254 6.400014 0.631907 1.33 3.024095 4.531545 4.918999 1.420871 Offre10 141996.977868 0.772398 1.815659 76.287668 0.558104 121.321271 2.314348
98 86.011908 4.033539 20.611835 0.200019 0.963337 10.629819 0.790306 2.67 16.973030 11.762926 17.817781 7.554069 Offre15 12052.670191 0.152620 12.026577 143.305932 0.605920 15.517343 28.809924
99 89.864928 0.624341 22.949669 2.234320 0.512487 0.184009 4.610618 0.00 3.720673 17.606096 0.793076 0.566853 Offre30 134081.459601 0.893336 2.228678 79.240784 0.822927 78.564971 44.972508

100 rows × 20 columns

In [9]:
nan_counts = dataTelecomSelected.isna().sum(axis=1)
num_rows_with_nan = (nan_counts > 0).sum()
print("Number of rows with NaN values:", num_rows_with_nan)
Number of rows with NaN values: 0
In [12]:
profile=profile_report(dataTelecomSelected)
profile
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[12]:

In [13]:
#-----------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------[ Label Encoder ]----------------------------------------------------------
#-----------------------------------------------------------------------------------------------------------------------------
In [14]:
# Create an instance of LabelEncoder
label_encoder = LabelEncoder()
dataTelecomEncoded=dataTelecom.copy()
# Fit and transform the column you want to encode
encoded_statut = label_encoder.fit_transform(dataTelecomEncoded['STATUT'])
encoded_offre = label_encoder.fit_transform(dataTelecomEncoded['OFFRE'])
encoded_handset = label_encoder.fit_transform(dataTelecomEncoded['HANDSET'])

# Replace the original column with the encoded values
dataTelecomEncoded['STATUT'] = encoded_statut
dataTelecomEncoded['OFFRE'] = encoded_offre
dataTelecomEncoded['HANDSET'] = encoded_handset
dataTelecomEncoded.head(20)
Out[14]:
STATUT OFFRE ANC_M HANDSET revenu_voix revenu_inter NB_JOUR_ACTIVITE_TAXE NB_JOUR_APPEL_TAXE DUREE_APPEL_TOT DUREE_APPEL_TAXEE ... FREQ_USSD_VOIX FREQ_USSD_SMS VOLUME_SESSION VOLUME_SESSION_WEEKEND REVENU_VAS ARPU P_revenu_data P_revenu_voix_c P_revenu_vas_c id_client
0 0 23 123 0 12.709375 1.464447 10.366042 10.658842 27.261836 2.125971 ... 28.679570 146.930634 200000.000000 54.025577 1.059822 4.325398 0.136685 0.943095 0.279541 1.0
1 0 28 98 0 3.000000 0.756078 1.330736 7.079320 19.883099 7.735475 ... 28.490597 169.219363 3337.992419 24.789260 1.411803 2.677563 0.533695 0.540374 0.712010 2.0
2 0 16 90 2 32.514156 0.681197 0.158160 4.367702 133.476368 7.681088 ... 190.000000 44.403308 107082.775926 160.031496 18.195224 1.954007 0.799606 0.071368 0.376754 3.0
3 0 1 226 0 3.821551 5.265345 4.003452 3.086766 35.053364 8.540951 ... 19.412661 190.000000 128700.752169 102.668979 17.746873 2.496790 0.426526 0.314082 0.676112 4.0
4 0 0 139 0 60.009385 1.957144 5.689241 0.403300 0.844956 12.355853 ... 81.567146 6.365788 75654.384291 0.341570 7.280861 2.291381 0.319736 0.600057 0.719450 5.0
5 1 7 118 0 3.000000 1.636158 9.406248 3.346885 26.779409 14.407004 ... 34.922712 40.620738 65253.482895 33.274496 10.439648 6.212364 0.274582 0.419963 0.280881 6.0
6 0 1 154 0 31.953274 1.656711 3.993698 5.357250 2.113332 20.677427 ... 34.152966 95.329383 45157.810902 41.821374 0.249189 1.432389 0.566405 0.152414 0.130876 7.0
7 1 26 96 0 10.331387 2.498662 16.020874 0.100707 5.922602 7.850522 ... 65.508907 75.616780 37165.463625 65.573652 1.780176 3.982989 0.620034 0.369095 0.722737 8.0
8 0 15 112 0 14.262607 8.888795 0.773295 2.281113 68.691901 0.827132 ... 190.000000 28.773361 200000.000000 35.931179 0.290472 1.772127 0.087045 0.512316 0.922651 9.0
9 0 28 137 0 5.870684 0.824999 3.084907 11.425205 37.872839 19.419044 ... 50.228614 89.822382 65233.389597 36.545510 0.035157 0.874001 0.708388 0.766209 0.142649 10.0
10 1 27 226 1 7.088535 0.283055 8.315342 1.366076 2.214913 41.120205 ... 44.067410 37.340224 12190.119629 48.818017 0.138935 0.658657 0.788564 0.017558 0.923704 11.0
11 1 21 128 0 16.149193 3.850653 3.196988 8.597772 47.642021 29.893714 ... 166.455950 10.524902 43574.175204 18.537058 7.382145 13.366764 0.948362 0.258017 0.969015 12.0
12 0 2 183 0 11.716227 1.283955 13.915028 12.128053 22.875005 1.569514 ... 10.024603 113.904802 17217.333644 8.255863 1.179744 2.731058 0.230116 0.347973 0.650547 13.0
13 1 23 172 0 6.503985 0.096397 2.708102 30.000000 40.633315 2.772596 ... 53.472655 33.181023 200000.000000 33.145742 0.619742 2.244286 0.810021 0.711795 0.392277 14.0
14 0 0 101 0 25.101203 3.188984 17.604631 0.099994 250.171717 8.930461 ... 10.210870 112.839109 48378.428305 23.698705 1.241074 11.912208 0.778765 0.978677 0.859013 15.0
15 1 28 200 0 3.000000 6.272574 0.296121 6.216673 28.286259 16.845520 ... 76.036535 60.189079 125226.666808 28.999614 1.578759 6.275846 0.387570 0.785484 0.732166 16.0
16 0 10 85 0 13.091479 2.178118 4.339566 0.919820 213.849629 7.779880 ... 89.554035 72.450946 39052.915945 6.732413 0.494723 4.957723 0.581744 0.438515 0.525448 17.0
17 0 24 185 0 3.000000 3.727984 5.917777 4.932776 99.232097 17.134669 ... 9.215928 50.604892 115951.083186 60.904464 13.264392 7.963693 0.662598 0.920679 0.150350 18.0
18 1 12 163 0 3.000000 0.153387 4.543918 2.796197 206.324350 11.374446 ... 8.454184 6.669602 200000.000000 67.454921 5.150976 4.218149 0.431221 0.228147 0.994235 19.0
19 0 0 217 0 13.331367 2.774178 6.488598 3.763119 8.618444 1.289124 ... 35.583323 8.314417 16382.942987 600.878815 12.867072 6.429398 0.450828 0.469069 0.654148 20.0

20 rows × 64 columns

In [15]:
#-----------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------[ Correlation Matrix ]----------------------------------------------------
#-----------------------------------------------------------------------------------------------------------------------------
In [16]:
correlation_matrix = dataTelecomEncoded.corr()
In [17]:
fig = go.Figure(data=go.Heatmap(z=correlation_matrix.values,
                                x=correlation_matrix.columns,
                                y=correlation_matrix.index))

# Save the figure as a PNG file
fig.update_layout(height=800)
fig.write_html("correlation_matrix13.html")
fig.show()
In [18]:
#-------------------------------------------------------------------------------------------------------------------------
#---------------------------------------    Correlation Matrix > 0.5     -------------------------------------------------
#0.5 to 0.7: Substantial (significant) correlation | 0.7 to 0.9: Strong correlation | 0.9 to 1.0: Very strong correlation
#-------------------------------------------------------------------------------------------------------------------------


# Filter the correlation matrix
filtered_matrix = correlation_matrix[correlation_matrix > 0.5]

fig = go.Figure(data=go.Heatmap(z=filtered_matrix.values,
                                x=filtered_matrix.columns,
                                y=filtered_matrix.index))

# Update the layout to increase the height
fig.update_layout(height=600)

# Save the figure as a PNG file
fig.show()