#-----------------------------------------------------------------------------------------------------------------------------
#----------------------------------------------[ Needed Libraries Import ]----------------------------------------------------
#-----------------------------------------------------------------------------------------------------------------------------
import easygui
import sweetviz as sv
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from ydata_profiling.controller.pandas_decorator import profile_report
import plotly.graph_objects as go
#-----------------------------------------------------------------------------------------------------------------------------
#---------------------------------------------------[ Import DataBase ]-------------------------------------------------------
#-----------------------------------------------------------------------------------------------------------------------------
dataTelecom=pd.read_excel('S:\\TunisieTelecom\\TelecomData.xlsx')
dataTelecom.head(10)
STATUT | OFFRE | ANC_M | HANDSET | revenu_voix | revenu_inter | NB_JOUR_ACTIVITE_TAXE | NB_JOUR_APPEL_TAXE | DUREE_APPEL_TOT | DUREE_APPEL_TAXEE | ... | FREQ_USSD_VOIX | FREQ_USSD_SMS | VOLUME_SESSION | VOLUME_SESSION_WEEKEND | REVENU_VAS | ARPU | P_revenu_data | P_revenu_voix_c | P_revenu_vas_c | id_client | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Active | Offre30 | 123 | 2G | 12.709375 | 1.464447 | 10.366042 | 10.658842 | 27.261836 | 2.125971 | ... | 28.679570 | 146.930634 | 200000.000000 | 54.025577 | 1.059822 | 4.325398 | 0.136685 | 0.943095 | 0.279541 | 1.0 |
1 | Active | Offre8 | 98 | 2G | 3.000000 | 0.756078 | 1.330736 | 7.079320 | 19.883099 | 7.735475 | ... | 28.490597 | 169.219363 | 3337.992419 | 24.789260 | 1.411803 | 2.677563 | 0.533695 | 0.540374 | 0.712010 | 2.0 |
2 | Active | Offre24 | 90 | 4G | 32.514156 | 0.681197 | 0.158160 | 4.367702 | 133.476368 | 7.681088 | ... | 190.000000 | 44.403308 | 107082.775926 | 160.031496 | 18.195224 | 1.954007 | 0.799606 | 0.071368 | 0.376754 | 3.0 |
3 | Active | Offre10 | 226 | 2G | 3.821551 | 5.265345 | 4.003452 | 3.086766 | 35.053364 | 8.540951 | ... | 19.412661 | 190.000000 | 128700.752169 | 102.668979 | 17.746873 | 2.496790 | 0.426526 | 0.314082 | 0.676112 | 4.0 |
4 | Active | Offre1 | 139 | 2G | 60.009385 | 1.957144 | 5.689241 | 0.403300 | 0.844956 | 12.355853 | ... | 81.567146 | 6.365788 | 75654.384291 | 0.341570 | 7.280861 | 2.291381 | 0.319736 | 0.600057 | 0.719450 | 5.0 |
5 | Suspended | Offre16 | 118 | 2G | 3.000000 | 1.636158 | 9.406248 | 3.346885 | 26.779409 | 14.407004 | ... | 34.922712 | 40.620738 | 65253.482895 | 33.274496 | 10.439648 | 6.212364 | 0.274582 | 0.419963 | 0.280881 | 6.0 |
6 | Active | Offre10 | 154 | 2G | 31.953274 | 1.656711 | 3.993698 | 5.357250 | 2.113332 | 20.677427 | ... | 34.152966 | 95.329383 | 45157.810902 | 41.821374 | 0.249189 | 1.432389 | 0.566405 | 0.152414 | 0.130876 | 7.0 |
7 | Suspended | Offre6 | 96 | 2G | 10.331387 | 2.498662 | 16.020874 | 0.100707 | 5.922602 | 7.850522 | ... | 65.508907 | 75.616780 | 37165.463625 | 65.573652 | 1.780176 | 3.982989 | 0.620034 | 0.369095 | 0.722737 | 8.0 |
8 | Active | Offre23 | 112 | 2G | 14.262607 | 8.888795 | 0.773295 | 2.281113 | 68.691901 | 0.827132 | ... | 190.000000 | 28.773361 | 200000.000000 | 35.931179 | 0.290472 | 1.772127 | 0.087045 | 0.512316 | 0.922651 | 9.0 |
9 | Active | Offre8 | 137 | 2G | 5.870684 | 0.824999 | 3.084907 | 11.425205 | 37.872839 | 19.419044 | ... | 50.228614 | 89.822382 | 65233.389597 | 36.545510 | 0.035157 | 0.874001 | 0.708388 | 0.766209 | 0.142649 | 10.0 |
10 rows × 64 columns
dataTelecomSample = dataTelecom.sample(n=1000)
#-----------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------[ Variables Profiling ]----------------------------------------------------
#-----------------------------------------------------------------------------------------------------------------------------
dataTelecom.describe()
ANC_M | revenu_voix | revenu_inter | NB_JOUR_ACTIVITE_TAXE | NB_JOUR_APPEL_TAXE | DUREE_APPEL_TOT | DUREE_APPEL_TAXEE | DUREE_APPEL_GRATUIT | nb_APPEL_GRATUIT | nb_APPEL_TAXE | ... | FREQ_USSD_VOIX | FREQ_USSD_SMS | VOLUME_SESSION | VOLUME_SESSION_WEEKEND | REVENU_VAS | ARPU | P_revenu_data | P_revenu_voix_c | P_revenu_vas_c | id_client | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 92143.000000 | 92143.000000 | 92143.000000 | 92143.000000 | 92143.000000 | 92143.000000 | 92143.000000 | 92143.000000 | 92143.000000 | 92143.000000 | ... | 92143.000000 | 92143.000000 | 92143.000000 | 92143.000000 | 92143.000000 | 92143.000000 | 92143.000000 | 92143.000000 | 9.214300e+04 | 92142.000000 |
mean | 149.171364 | 15.239636 | 1.498722 | 7.809857 | 6.874736 | 99.955509 | 15.062154 | 15.062154 | 30.006924 | 14.912332 | ... | 79.053113 | 79.064533 | 86696.838371 | 100.244944 | 5.011416 | 4.979407 | 0.501161 | 0.499919 | 5.008529e-01 | 46071.500000 |
std | 57.765160 | 14.682741 | 1.497972 | 7.283316 | 6.526537 | 99.949019 | 15.022923 | 15.022923 | 29.974072 | 14.947696 | ... | 61.947681 | 61.913723 | 66338.180671 | 99.833913 | 5.026841 | 4.997452 | 0.288310 | 0.288800 | 2.891844e-01 | 26599.248589 |
min | 50.000000 | 3.000000 | 0.000001 | 0.000112 | 0.000068 | 0.000018 | 0.000240 | 0.000240 | 0.000261 | 0.000060 | ... | 0.003447 | 0.000055 | 2.689071 | 0.000994 | 0.000015 | 0.000009 | 0.000001 | 0.000015 | 9.336428e-07 | 1.000000 |
25% | 99.000000 | 4.276194 | 0.431270 | 2.283187 | 2.010586 | 28.798354 | 4.329003 | 4.329003 | 8.679738 | 4.281366 | ... | 25.926583 | 25.960899 | 29194.406876 | 29.182456 | 1.427911 | 1.429946 | 0.251467 | 0.249248 | 2.503687e-01 | 23036.250000 |
50% | 149.000000 | 10.358993 | 1.042991 | 5.493399 | 4.870998 | 69.419866 | 10.499201 | 10.499201 | 20.703940 | 10.333792 | ... | 62.154732 | 62.309948 | 69415.161210 | 69.773265 | 3.450904 | 3.443868 | 0.502028 | 0.498710 | 5.013815e-01 | 46071.500000 |
75% | 199.000000 | 20.725777 | 2.078414 | 11.126519 | 9.664099 | 137.938430 | 20.898580 | 20.898580 | 41.583846 | 20.680499 | ... | 124.685338 | 124.695080 | 138643.945475 | 138.925907 | 6.956149 | 6.895372 | 0.750763 | 0.749972 | 7.525205e-01 | 69106.750000 |
max | 249.000000 | 168.860618 | 19.976912 | 30.000000 | 30.000000 | 1155.277214 | 164.188009 | 164.188009 | 368.313521 | 171.425214 | ... | 190.000000 | 190.000000 | 200000.000000 | 1222.280603 | 60.850935 | 51.063309 | 0.999988 | 1.000000 | 9.999926e-01 | 92142.000000 |
8 rows × 61 columns
# Assuming your dataframe is called 'df'
selected_variables = [
'DUREE_APPEL_TOT',
'nb_sms_tot' ,
'revenu_cdr_c',
'MNT_RECH',
'FREQ_ACT_OUT',
'NB_JOUR_APPEL_TAXE' ,
'NB_SMS_TAXE' ,
'NB_RECH_SUP5',
'MNT_TRANSFERT_OUT' ,
'revenu_voix' ,
'MNT_FORFAIT_DATA',
'NB_FORFAIT_VOIX' ,
'OFFRE',
'VOLUME_SESSION',
'P_revenu_data',
'MNT_FORFAIT',
'FREQ_USSD',
'P_FF_Data' ,
'Duree_onnet_tot',
'Duree_offnet_tot' ,
]
# Select the desired variables from the dataframe
dataTelecomSelected = dataTelecom[selected_variables].copy()
dataTelecomSelected.head(100)
DUREE_APPEL_TOT | nb_sms_tot | revenu_cdr_c | MNT_RECH | FREQ_ACT_OUT | NB_JOUR_APPEL_TAXE | NB_SMS_TAXE | NB_RECH_SUP5 | MNT_TRANSFERT_OUT | revenu_voix | MNT_FORFAIT_DATA | NB_FORFAIT_VOIX | OFFRE | VOLUME_SESSION | P_revenu_data | MNT_FORFAIT | FREQ_USSD | P_FF_Data | Duree_onnet_tot | Duree_offnet_tot | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 27.261836 | 3.685763 | 19.941537 | 12.306756 | 10.667904 | 10.658842 | 1.624059 | 2.00 | 8.877510 | 12.709375 | 2.059265 | 8.589441 | Offre30 | 200000.000000 | 0.136685 | 10.882398 | 190.000000 | 0.498091 | 30.583118 | 27.219712 |
1 | 19.883099 | 1.476121 | 3.908449 | 10.348607 | 13.650949 | 7.079320 | 7.059615 | 0.67 | 1.633333 | 3.000000 | 20.025208 | 5.685491 | Offre8 | 3337.992419 | 0.533695 | 1.049276 | 18.623378 | 0.096339 | 13.426765 | 32.571522 |
2 | 133.476368 | 6.162948 | 32.541849 | 3.939266 | 7.079317 | 4.367702 | 1.638128 | 0.00 | 0.011593 | 32.514156 | 5.397060 | 4.417241 | Offre24 | 107082.775926 | 0.799606 | 4.041068 | 66.449461 | 0.667540 | 6.217262 | 24.861880 |
3 | 35.053364 | 1.448782 | 4.145009 | 2.408488 | 17.423588 | 3.086766 | 0.016726 | 0.33 | 2.003034 | 3.821551 | 0.534564 | 0.107662 | Offre10 | 128700.752169 | 0.426526 | 6.441283 | 19.259807 | 0.407815 | 5.837191 | 37.779013 |
4 | 0.844956 | 1.131308 | 60.496821 | 10.870592 | 0.067246 | 0.403300 | 3.934964 | 0.33 | 2.755649 | 60.009385 | 0.031219 | 2.276588 | Offre1 | 75654.384291 | 0.319736 | 0.805758 | 38.034894 | 0.828712 | 10.395156 | 2.831625 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
95 | 1.826910 | 3.198013 | 5.638954 | 5.605779 | 16.488487 | 2.872333 | 0.420523 | 0.00 | 2.242692 | 3.000000 | 7.558205 | 8.874387 | Offre8 | 32664.259485 | 0.857627 | 2.735164 | 41.723923 | 0.075043 | 2.354877 | 6.515707 |
96 | 80.858548 | 11.867689 | 8.145327 | 9.174032 | 13.042782 | 0.298674 | 0.329446 | 0.33 | 11.978245 | 3.000000 | 31.046581 | 2.124691 | Offre2 | 82909.748831 | 0.980014 | 6.002234 | 177.422094 | 0.971460 | 0.183394 | 49.746824 |
97 | 107.330103 | 12.026432 | 6.377507 | 16.425132 | 2.520254 | 6.400014 | 0.631907 | 1.33 | 3.024095 | 4.531545 | 4.918999 | 1.420871 | Offre10 | 141996.977868 | 0.772398 | 1.815659 | 76.287668 | 0.558104 | 121.321271 | 2.314348 |
98 | 86.011908 | 4.033539 | 20.611835 | 0.200019 | 0.963337 | 10.629819 | 0.790306 | 2.67 | 16.973030 | 11.762926 | 17.817781 | 7.554069 | Offre15 | 12052.670191 | 0.152620 | 12.026577 | 143.305932 | 0.605920 | 15.517343 | 28.809924 |
99 | 89.864928 | 0.624341 | 22.949669 | 2.234320 | 0.512487 | 0.184009 | 4.610618 | 0.00 | 3.720673 | 17.606096 | 0.793076 | 0.566853 | Offre30 | 134081.459601 | 0.893336 | 2.228678 | 79.240784 | 0.822927 | 78.564971 | 44.972508 |
100 rows × 20 columns
nan_counts = dataTelecomSelected.isna().sum(axis=1)
num_rows_with_nan = (nan_counts > 0).sum()
print("Number of rows with NaN values:", num_rows_with_nan)
Number of rows with NaN values: 0
profile=profile_report(dataTelecomSelected)
profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]