Matthew Hoyle / Shuyuan Shen / Junseok Yang
Retaining existing customers and finding new ones are crucial for the sustainability and profitability of any business, and banking is no exception. The intense competition in the credit card market makes it extremely difficult for banks to attract new customers and prevent attrition among existing users (Magatef and Tomalieh 2015). This is probably one of the reasons why most of us have received so much credit card marketing mail from various banks.
However, it is economically inefficient and wasteful to send promotional mail repeatedly to every individual in the country, since not everyone needs or wants a credit card. Being able to identify the individuals who are most likely to be credit card customers would significantly improve the efficiency of marketing in the banking industry.
The question then becomes how to identify individuals who are most likely to continue being credit card customers. A widely used marketing strategy is called market segmentation (Tynan and Drayton 1987; Yankelovich and Meer 2006). According to Tynan and Drayton (1987, p.301), the goal of market segmentation is to identify and delineate market segments or “sets of buyers,” which would then become targets for the company’s marketing plans. Buyers in each set tend to be homogeneous and share some common characteristics.
The traditional market segmentation technique uses surveys to collect data from the population and employs multiple discriminant analysis, multiple regression analysis, or other analytical procedures to establish the profile of the segments (Tynan and Drayton 1987). In recent years, scholars and banks have begun to use cutting-edge machine learning techniques and big data to further improve the accuracy and efficiency of market segmentation (Elrefai, Elgazzar, and Khodeir 2021; Dawood, Elfakhrany, and Maghraby 2019; Kaminskyi, Nehrey, and Zomchak 2021; Mishra et al. 2020).
This project constitutes one of the efforts to use machine learning techniques in market segmentation in the banking industry. There are two central research questions that this project intends to answer:
To answer these questions, we use a dataset that documents the demographics and usage behaviors of just over 10,000 active credit card holders during the last six months. The file is at the customer level with 20 substantive variables (plus an ID column and two columns the documentation says to ignore), including a flag that indicates whether the holder has attrited. We will start with data cleaning and some descriptive analytics of the dataset. Then, we will answer the two research questions step by step.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from gower import gower_matrix
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram
from pyclustertend import hopkins
%matplotlib inline
df = pd.read_csv("BankChurners.csv")
df.head()
CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | ... | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 | Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | ... | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 | 0.000093 | 0.99991 |
1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | ... | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 | 0.000057 | 0.99994 |
2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | ... | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 | 0.000021 | 0.99998 |
3 | 769911858 | Existing Customer | 40 | F | 4 | High School | Unknown | Less than $40K | Blue | 34 | ... | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 | 0.000134 | 0.99987 |
4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | ... | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 | 0.000022 | 0.99998 |
5 rows × 23 columns
The documentation of the dataset says we should ignore the last two columns and suggests deleting them. Thus, we deleted the last two columns as well as the first column, which contains the IDs of the card holders.
# Drop the first and last 2 columns
# df - Original Dataset without Client Number and 'Naive Bayes ~' columns
df = df.drop(['CLIENTNUM', 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1', 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'], axis = 1)
df.head()
Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | 5 | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
1 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | 6 | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
2 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | 4 | 1 | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
3 | Existing Customer | 40 | F | 4 | High School | Unknown | Less than $40K | Blue | 34 | 3 | 4 | 1 | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
4 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | 5 | 1 | 0 | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 |
# Check the data type of each column
df.dtypes
Attrition_Flag object Customer_Age int64 Gender object Dependent_count int64 Education_Level object Marital_Status object Income_Category object Card_Category object Months_on_book int64 Total_Relationship_Count int64 Months_Inactive_12_mon int64 Contacts_Count_12_mon int64 Credit_Limit float64 Total_Revolving_Bal int64 Avg_Open_To_Buy float64 Total_Amt_Chng_Q4_Q1 float64 Total_Trans_Amt int64 Total_Trans_Ct int64 Total_Ct_Chng_Q4_Q1 float64 Avg_Utilization_Ratio float64 dtype: object
for col in df.columns:
print("Unique Values of %s"%(col))
print(df[col].unique())
print(df[col].dtypes)
print('--------------------------------')
Unique Values of Attrition_Flag ['Existing Customer' 'Attrited Customer'] object
Unique Values of Customer_Age [45 49 51 40 44 32 37 48 42 65 56 35 57 41 61 47 62 54 59 63 53 58 55 66 50 38 46 52 39 43 64 68 67 60 73 70 36 34 33 26 31 29 30 28 27] int64
Unique Values of Gender ['M' 'F'] object
Unique Values of Dependent_count [3 5 4 2 0 1] int64
Unique Values of Education_Level ['High School' 'Graduate' 'Uneducated' 'Unknown' 'College' 'Post-Graduate' 'Doctorate'] object
Unique Values of Marital_Status ['Married' 'Single' 'Unknown' 'Divorced'] object
Unique Values of Income_Category ['$60K - $80K' 'Less than $40K' '$80K - $120K' '$40K - $60K' '$120K +' 'Unknown'] object
Unique Values of Card_Category ['Blue' 'Gold' 'Silver' 'Platinum'] object
Unique Values of Months_on_book [39 44 36 34 21 46 27 31 54 30 48 37 56 42 49 33 28 38 41 43 45 52 40 50 35 47 32 20 29 25 53 24 55 23 22 26 13 51 19 15 17 18 16 14] int64
Unique Values of Total_Relationship_Count [5 6 4 3 2 1] int64
Unique Values of Months_Inactive_12_mon [1 4 2 3 6 0 5] int64
Unique Values of Contacts_Count_12_mon [3 2 0 1 4 5 6] int64
Unique Values of Credit_Limit [12691. 8256. 3418. ... 5409. 5281. 10388.] float64
Unique Values of Total_Revolving_Bal [ 777 864 0 ... 534 476 2241] int64
Unique Values of Avg_Open_To_Buy [11914. 7392. 3418. ... 11831. 5409. 8427.] float64
Unique Values of Total_Amt_Chng_Q4_Q1 [1.335 1.541 2.594 ... 0.222 0.204 0.166] float64
Unique Values of Total_Trans_Amt [ 1144 1291 1887 ... 10291 8395 10294] int64
Unique Values of Total_Trans_Ct [ 42 33 20 28 24 31 ... 128 139 123 130 138 132] int64
Unique Values of Total_Ct_Chng_Q4_Q1 [1.625 3.714 2.333 2.5 ... 0.948 1.684 0.918] float64
Unique Values of Avg_Utilization_Ratio [0.061 0.105 0. 0.76 0.311 ... 0.005 0.007 0.014 0.009] float64
df.isna().sum()
Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
Since the dataset does not have any missing values (NaN), we are good to go!
df.describe()
Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 |
mean | 46.325960 | 2.346203 | 35.928409 | 3.812580 | 2.341167 | 2.455317 | 8631.953698 | 1162.814061 | 7469.139637 | 0.759941 | 4404.086304 | 64.858695 | 0.712222 | 0.274894 |
std | 8.016814 | 1.298908 | 7.986416 | 1.554408 | 1.010622 | 1.106225 | 9088.776650 | 814.987335 | 9090.685324 | 0.219207 | 3397.129254 | 23.472570 | 0.238086 | 0.275691 |
min | 26.000000 | 0.000000 | 13.000000 | 1.000000 | 0.000000 | 0.000000 | 1438.300000 | 0.000000 | 3.000000 | 0.000000 | 510.000000 | 10.000000 | 0.000000 | 0.000000 |
25% | 41.000000 | 1.000000 | 31.000000 | 3.000000 | 2.000000 | 2.000000 | 2555.000000 | 359.000000 | 1324.500000 | 0.631000 | 2155.500000 | 45.000000 | 0.582000 | 0.023000 |
50% | 46.000000 | 2.000000 | 36.000000 | 4.000000 | 2.000000 | 2.000000 | 4549.000000 | 1276.000000 | 3474.000000 | 0.736000 | 3899.000000 | 67.000000 | 0.702000 | 0.176000 |
75% | 52.000000 | 3.000000 | 40.000000 | 5.000000 | 3.000000 | 3.000000 | 11067.500000 | 1784.000000 | 9859.000000 | 0.859000 | 4741.000000 | 81.000000 | 0.818000 | 0.503000 |
max | 73.000000 | 5.000000 | 56.000000 | 6.000000 | 6.000000 | 6.000000 | 34516.000000 | 2517.000000 | 34516.000000 | 3.397000 | 18484.000000 | 139.000000 | 3.714000 | 0.999000 |
The table above shows the descriptive statistics of the numerical variables in the dataset. The mean age of credit card customers is about 46, which is older than one might expect, given how many young people use credit cards nowadays.
One thing to notice in this table is the differences in means and variances across these variables. Some variables, such as "Dependent count" and "Months Inactive", have very small means and standard deviations, while others, such as "Credit Limit" and "Total Transaction Amount", have very large ones. Thus, we should be mindful of the scale differences and standardize the variables when needed.
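As a concrete reference for later steps, the standardization we apply (via scikit-learn's `StandardScaler`) is the z-score transform $z = (x - \mu)/\sigma$, which gives every numerical variable zero mean and unit variance so that no variable dominates a distance computation merely because of its units.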
cat_list = ['Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']
df_cat = df[cat_list].copy()
df_cat['Education_Level'] = pd.Categorical(df_cat['Education_Level'], ['Uneducated','High School','College','Graduate','Post-Graduate', 'Doctorate', 'Unknown'])
df_cat['Marital_Status'] = pd.Categorical(df_cat['Marital_Status'], ['Single','Married','Divorced','Unknown'])
df_cat['Income_Category'] = pd.Categorical(df_cat['Income_Category'], ['Less than $40K','$40K - $60K','$60K - $80K','$80K - $120K','$120K +', 'Unknown'])
df_cat['Card_Category'] = pd.Categorical(df_cat['Card_Category'], ['Blue','Silver','Gold','Platinum'])
for col in cat_list:
plt.figure(figsize = (10, 5))
print(df[col].value_counts())
sns.histplot(x = col, data = df_cat)
plt.title("Histogram of %s"%(col))
plt.show()
print('------------------------------------------------------------------')
F 5358 M 4769 Name: Gender, dtype: int64
------------------------------------------------------------------ Graduate 3128 High School 2013 Unknown 1519 Uneducated 1487 College 1013 Post-Graduate 516 Doctorate 451 Name: Education_Level, dtype: int64
------------------------------------------------------------------ Married 4687 Single 3943 Unknown 749 Divorced 748 Name: Marital_Status, dtype: int64
------------------------------------------------------------------ Less than $40K 3561 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 Unknown 1112 $120K + 727 Name: Income_Category, dtype: int64
------------------------------------------------------------------ Blue 9436 Silver 555 Gold 116 Platinum 20 Name: Card_Category, dtype: int64
------------------------------------------------------------------
The histograms above show the count of each category in each categorical variable. As we can see, the dataset is quite balanced in terms of gender: the numbers of male and female customers are about the same.

People with a graduate degree are overrepresented in this dataset, and the numbers of customers who are married or single are significantly higher than those who are divorced or of unknown status.

A majority of people in this dataset earn less than 40,000 dollars annually.

Most of them hold the Blue card, compared with the more advanced Silver, Gold, and Platinum cards.
Since our dataset is fairly large, we would like to work with a smaller random sample of the original data.

Furthermore, considering the significant imbalance between 'Existing' and 'Attrited' customers (as shown below), we would like to sample 800 rows from each customer status so that the two groups are balanced.
# Check the proportion of Existing vs Attrited
print(len(df[df['Attrition_Flag'] == 'Existing Customer']))
print(len(df[df['Attrition_Flag'] == 'Attrited Customer']))
8500 1627
# Equal-sized random samples from the two Attrition Flag types
X_1 = df[df['Attrition_Flag'] == 'Existing Customer'].sample(n = 800, replace = False, random_state = 100)
X_2 = df[df['Attrition_Flag'] == 'Attrited Customer'].sample(n = 800, replace = False, random_state = 100)
# Concatenate two dataframes into one dataframe
X = pd.concat([X_1, X_2])
X
Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1237 | Existing Customer | 38 | F | 3 | Graduate | Married | Less than $40K | Blue | 18 | 6 | 3 | 2 | 2193.0 | 1260 | 933.0 | 0.886 | 1799 | 47 | 0.567 | 0.575 |
1201 | Existing Customer | 57 | F | 1 | Unknown | Married | Less than $40K | Blue | 47 | 4 | 1 | 3 | 3416.0 | 2468 | 948.0 | 0.598 | 1536 | 42 | 0.556 | 0.722 |
6921 | Existing Customer | 41 | F | 4 | High School | Married | Unknown | Blue | 30 | 5 | 2 | 2 | 9139.0 | 1620 | 7519.0 | 0.773 | 4675 | 78 | 1.108 | 0.177 |
6133 | Existing Customer | 42 | F | 4 | Graduate | Married | Less than $40K | Blue | 36 | 6 | 2 | 1 | 1438.3 | 0 | 1438.3 | 0.795 | 4396 | 70 | 0.628 | 0.000 |
4396 | Existing Customer | 42 | F | 3 | High School | Married | Less than $40K | Blue | 36 | 4 | 3 | 1 | 9351.0 | 1482 | 7869.0 | 0.464 | 5099 | 60 | 0.714 | 0.158 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2077 | Attrited Customer | 58 | M | 2 | Uneducated | Single | $120K + | Blue | 46 | 2 | 3 | 4 | 16163.0 | 0 | 16163.0 | 0.542 | 2171 | 48 | 0.778 | 0.000 |
8757 | Attrited Customer | 50 | M | 4 | High School | Single | $120K + | Blue | 41 | 4 | 3 | 3 | 6982.0 | 0 | 6982.0 | 1.018 | 4830 | 50 | 0.724 | 0.000 |
3094 | Attrited Customer | 49 | M | 4 | Graduate | Single | $60K - $80K | Blue | 36 | 1 | 4 | 5 | 5662.0 | 1484 | 4178.0 | 0.434 | 1693 | 25 | 0.250 | 0.262 |
5553 | Attrited Customer | 62 | M | 0 | High School | Single | $60K - $80K | Blue | 51 | 3 | 2 | 4 | 18734.0 | 0 | 18734.0 | 0.674 | 2112 | 52 | 0.857 | 0.000 |
8784 | Attrited Customer | 53 | F | 2 | High School | Single | Less than $40K | Blue | 46 | 6 | 1 | 2 | 3199.0 | 0 | 3199.0 | 1.047 | 4805 | 59 | 0.639 | 0.000 |
1600 rows × 20 columns
Since some attributes, such as 'Credit_Limit', take relatively large values and have large standard deviations, and could therefore dominate the other attributes, it is better to scale the numerical variables before analyzing the dataset.
# First, drop all categorical variables
drop_list = ['Attrition_Flag','Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']
# x - Scaled Dataset
x = X.copy()
x = x.drop(drop_list, axis = 1)
x.head()
Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1237 | 38 | 3 | 18 | 6 | 3 | 2 | 2193.0 | 1260 | 933.0 | 0.886 | 1799 | 47 | 0.567 | 0.575 |
1201 | 57 | 1 | 47 | 4 | 1 | 3 | 3416.0 | 2468 | 948.0 | 0.598 | 1536 | 42 | 0.556 | 0.722 |
6921 | 41 | 4 | 30 | 5 | 2 | 2 | 9139.0 | 1620 | 7519.0 | 0.773 | 4675 | 78 | 1.108 | 0.177 |
6133 | 42 | 4 | 36 | 6 | 2 | 1 | 1438.3 | 0 | 1438.3 | 0.795 | 4396 | 70 | 0.628 | 0.000 |
4396 | 42 | 3 | 36 | 4 | 3 | 1 | 9351.0 | 1482 | 7869.0 | 0.464 | 5099 | 60 | 0.714 | 0.158 |
# Scale numerical variables
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss_array = ss.fit_transform(x)
ss_array
array([[-1.07534812, 0.5175485 , -2.30424022, ..., -0.43210519, -0.31752541, 1.23687695], [ 1.37558777, -1.01450997, 1.43371546, ..., -0.65405329, -0.36241794, 1.76346671], [-0.68835825, 1.28357774, -0.75749994, ..., 0.94397298, 1.89037107, -0.18885586], ..., [ 0.34361476, 1.28357774, 0.0158702 , ..., -1.4086768 , -1.61124663, 0.11563481], [ 2.0205709 , -1.7805392 , 1.94929555, ..., -0.2101571 , 0.86600505, -0.82291292], [ 0.85960127, -0.24848073, 1.30482043, ..., 0.10057023, -0.02368336, -0.82291292]])
# Need to match the index
x = pd.DataFrame(ss_array, columns = x.columns, index = x.index)
x.head()
Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1237 | -1.075348 | 0.517549 | -2.304240 | 1.523878 | 0.516027 | -0.574377 | -0.690154 | 0.331772 | -0.723002 | 0.688372 | -0.671670 | -0.432105 | -0.317525 | 1.236877 |
1201 | 1.375588 | -1.014510 | 1.433715 | 0.255631 | -1.487960 | 0.300031 | -0.554428 | 1.689254 | -0.721337 | -0.629120 | -0.756892 | -0.654053 | -0.362418 | 1.763467 |
6921 | -0.688358 | 1.283578 | -0.757500 | 0.889755 | -0.485967 | -0.574377 | 0.080698 | 0.736320 | 0.007982 | 0.171440 | 0.260265 | 0.943973 | 1.890371 | -0.188856 |
6133 | -0.559362 | 1.283578 | 0.015870 | 1.523878 | -0.485967 | -1.448786 | -0.773909 | -1.084144 | -0.666919 | 0.272082 | 0.169858 | 0.588856 | -0.068576 | -0.822913 |
4396 | -0.559362 | 0.517549 | 0.015870 | 0.255631 | 0.516027 | -1.448786 | 0.104226 | 0.581243 | 0.046829 | -1.242119 | 0.397657 | 0.144960 | 0.282402 | -0.256918 |
# Put categorical variables back into the dataset
for col in cat_list:
x[col] = X[col]
x.head()
Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Gender | Education_Level | Marital_Status | Income_Category | Card_Category | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1237 | -1.075348 | 0.517549 | -2.304240 | 1.523878 | 0.516027 | -0.574377 | -0.690154 | 0.331772 | -0.723002 | 0.688372 | -0.671670 | -0.432105 | -0.317525 | 1.236877 | F | Graduate | Married | Less than $40K | Blue |
1201 | 1.375588 | -1.014510 | 1.433715 | 0.255631 | -1.487960 | 0.300031 | -0.554428 | 1.689254 | -0.721337 | -0.629120 | -0.756892 | -0.654053 | -0.362418 | 1.763467 | F | Unknown | Married | Less than $40K | Blue |
6921 | -0.688358 | 1.283578 | -0.757500 | 0.889755 | -0.485967 | -0.574377 | 0.080698 | 0.736320 | 0.007982 | 0.171440 | 0.260265 | 0.943973 | 1.890371 | -0.188856 | F | High School | Married | Unknown | Blue |
6133 | -0.559362 | 1.283578 | 0.015870 | 1.523878 | -0.485967 | -1.448786 | -0.773909 | -1.084144 | -0.666919 | 0.272082 | 0.169858 | 0.588856 | -0.068576 | -0.822913 | F | Graduate | Married | Less than $40K | Blue |
4396 | -0.559362 | 0.517549 | 0.015870 | 0.255631 | 0.516027 | -1.448786 | 0.104226 | 0.581243 | 0.046829 | -1.242119 | 0.397657 | 0.144960 | 0.282402 | -0.256918 | F | High School | Married | Less than $40K | Blue |
The variables that will be used in this analysis are the banking information variables. Because these variables comprise both numerical and categorical attributes, we will create a Gower's distance matrix from the data.
df_q2 = df.copy()
# separate customer demographic features from banking information features
cust_dem = ['Customer_Age', 'Gender', 'Dependent_count', 'Education_Level', 'Marital_Status', 'Income_Category']
bank_info = ['Card_Category', 'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon',
'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy',
'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1',
'Avg_Utilization_Ratio']
df_bankinfo = df_q2[bank_info]
# create gower matrix for clustering
dist_mat_bankinfo = gower_matrix(df_bankinfo)
dist_mat_bankinfo
array([[0. , 0.11751616, 0.18762836, ..., 0.18198298, 0.18662769, 0.28717646], [0.11751616, 0. , 0.18806979, ..., 0.2574561 , 0.2621008 , 0.33431634], [0.18762836, 0.18806979, 0. , ..., 0.23276597, 0.20547554, 0.43145663], ..., [0.18198298, 0.2574561 , 0.23276597, ..., 0. , 0.04651431, 0.22433168], [0.18662769, 0.2621008 , 0.20547554, ..., 0.04651431, 0. , 0.26042324], [0.28717646, 0.33431634, 0.43145663, ..., 0.22433168, 0.26042324, 0. ]], dtype=float32)
Next, we will use this Gower's distance matrix to build t-SNE plots, which will give us insight into the clustering structure of our banking information features. We do not need to standardize the features beforehand, since Gower's distance range-normalizes each numerical variable internally. We will also calculate the Hopkins statistic, which will give us insight into the clusterability of our data.
# perform t-SNE to determine the clusterability of the banking information features
np.random.seed(660510475)
fig, ax = plt.subplots(2, 6, figsize = (30, 10))
i = j = 0
for perp in [5, 10, 20, 30, 40, 50]:
    random_st_r1 = np.random.uniform(0, 100000, 2)
    i = 0
    for ran_state in random_st_r1.astype(int):
        tsne_r1 = TSNE(n_components = 2, perplexity = perp, random_state = ran_state)
        data_tsne_r1 = tsne_r1.fit_transform(dist_mat_bankinfo)
        df_tsne_r1 = pd.DataFrame(data_tsne_r1, columns = ['x', 'y'])
        ax[i, j].scatter(df_tsne_r1['x'], df_tsne_r1['y'])
        ax[i, j].set_title("t-SNE\nPerplexity = {0}, Random State = {1}".format(perp, ran_state))
        i += 1
    j += 1
plt.show()
hopkins(dist_mat_bankinfo, int(dist_mat_bankinfo.shape[0]/10))
0.08858811655880006
We select a t-SNE plot that represents the clustering structure well. From this plot and the low Hopkins statistic, we see that the banking information on its own is likely clusterable.
# t-SNE
perp_r1 = 40
ran_state_r1 = 67458
tsne_r1 = TSNE(n_components = 2, perplexity = perp_r1, random_state = ran_state_r1)
data_tsne_r1 = tsne_r1.fit_transform(dist_mat_bankinfo)
df_tsne_r1 = pd.DataFrame(data_tsne_r1, columns = ['x', 'y'])
plt.scatter(df_tsne_r1['x'], df_tsne_r1['y'])
plt.title("t-SNE\nPerplexity = {0}, Random State = {1}".format(perp_r1, ran_state_r1))
plt.show()
We next color our chosen t-SNE plot by each of the banking information variables. We see that the small cluster at the top left is primarily comprised of non-Blue card categories. We also see high total relationship counts towards the bottom of the plot, higher credit limits towards the top left, higher total revolving balances towards the right, high average open-to-buy towards the top left, high total transaction amounts and counts towards the top, and higher average utilization ratios towards the right. This shows that the variables included in this analysis exhibit distinct variability within the data and that clustering is possible.
for col in bank_info:
sns.scatterplot(x = 'x', y = 'y', data = df_tsne_r1, hue = df_q2[col])
plt.title("t-SNE\nPerplexity = {0}, Random State = {1}".format(perp_r1, ran_state_r1))
plt.legend(title = col,
loc = (1, 0))
plt.show()
Considering that our dataset is a mix of numerical and categorical attributes, we use Gower's distance to capture the distance between customers. Gower's distances range from 0 to 1, with 0 indicating that two customers are identical on all variables and 1 indicating that every categorical value differs and every numerical attribute is as far apart as the dataset allows.
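To make the definition concrete, below is a minimal, hypothetical sketch of a single pairwise Gower distance (the `gower_pair` helper is ours, for illustration only; the `gower_matrix` call below computes this for every pair of rows):

import numpy as np

# Illustration only: Gower's distance averages per-attribute distances, where
# a numerical attribute contributes its range-normalized absolute difference
# and a categorical attribute contributes 0 on a match and 1 on a mismatch.
def gower_pair(row_a, row_b, num_cols, cat_cols, ranges):
    d = [abs(row_a[c] - row_b[c]) / ranges[c] for c in num_cols]
    d += [0.0 if row_a[c] == row_b[c] else 1.0 for c in cat_cols]
    return np.mean(d)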
# X - Unscaled Dataset, x - Scaled Dataset
from gower import gower_matrix
dist_mat = gower_matrix(x)
dist_mat
array([[0. , 0.22388041, 0.23646867, ..., 0.3480109 , 0.4914987 , 0.26662782], [0.22388041, 0. , 0.30855188, ..., 0.41026807, 0.41364172, 0.27406055], [0.23646867, 0.30855188, 0. , ..., 0.3848821 , 0.39061034, 0.26088417], ..., [0.3480109 , 0.41026807, 0.3848821 , ..., 0. , 0.29175434, 0.3959032 ], [0.4914987 , 0.41364172, 0.39061034, ..., 0.29175434, 0. , 0.2725757 ], [0.26662782, 0.27406055, 0.26088417, ..., 0.3959032 , 0.2725757 , 0. ]], dtype=float32)
from pyclustertend import hopkins
num_trials=5
hopkins_stats=[]
for i in range(0,num_trials):
n = len(dist_mat)
p = int(0.1 * n)
hopkins_stats.append(hopkins(dist_mat,p))
print(hopkins_stats)
[0.15794299960525593, 0.15723965660010883, 0.15875196247402917, 0.16355395486362245, 0.15610077200721056]
To determine the clusterability of the dataset, we first calculate the Hopkins statistic, which is a measure of the clusterability of a dataset. The five Hopkins statistics above are all much closer to 0 than to 0.5, which, under the convention pyclustertend follows, indicates that the dataset is clusterable.
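For intuition, here is a rough, hypothetical sketch of the statistic's logic under that convention (values near 0.5 suggest uniformly distributed data, values near 0 suggest clustered data); pyclustertend's actual implementation may differ in its details:

import numpy as np
from sklearn.neighbors import NearestNeighbors

# Compare nearest-neighbor distances from m uniform random points (u) with
# those from m real sample points (w); clustered data gives w << u, so the
# ratio w / (u + w) falls toward 0.
def hopkins_sketch(data, m, seed=0):
    rng = np.random.default_rng(seed)
    n, d = data.shape
    nn = NearestNeighbors(n_neighbors=2).fit(data)
    uniform = rng.uniform(data.min(axis=0), data.max(axis=0), size=(m, d))
    u = nn.kneighbors(uniform, n_neighbors=1)[0].sum()
    sample = data[rng.choice(n, m, replace=False)]
    w = nn.kneighbors(sample, n_neighbors=2)[0][:, 1].sum()
    return w / (u + w)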
Next, we run the t-SNE algorithm to further explore the clusterability of the dataset.
from sklearn.manifold import TSNE
import warnings
warnings.filterwarnings('ignore')
for perp in [5,10, 20, 30, 40, 50]:
for rs in [23,88]:
tsne = TSNE(n_components=2, perplexity=perp, random_state=rs, metric='precomputed')
data_tsne = tsne.fit_transform(dist_mat)
# Need to match the index
df_tsne = pd.DataFrame(data_tsne, columns=['x_projected', 'y_projected'], index = x.index)
df_combo = pd.concat([x, df_tsne], axis=1)
sns.scatterplot(x='x_projected',y='y_projected', data=df_combo)
plt.title('t-SNE Plot with Perplexity Value %s and Random State %s' %(perp, rs))
plt.show()
print('--------------------------------------------')
From the t-SNE plots above, we can see clusters forming, which suggests that the dataset is clusterable. Most t-SNE plots with a perplexity value of 20 or more display two main clusters, and there may be some smaller sub-clusters as well.

We present below the t-SNE plot with a perplexity value of 50 and random state of 23, as it has one of the clearest clustering structures, with two main clusters and some sub-clusters.
tsne = TSNE(n_components=2, perplexity=50, random_state=23, metric='precomputed')
data_tsne = tsne.fit_transform(dist_mat)
df_tsne = pd.DataFrame(data_tsne, columns=['x_projected', 'y_projected'], index = x.index)
df_combo = pd.concat([x, df_tsne], axis=1)
sns.scatterplot(x='x_projected',y='y_projected', data=df_combo)
plt.title('t-SNE Plot with Perplexity Value %s and Random State %s' %(50, 23))
plt.show()
for col in x.columns:
sns.scatterplot(x='x_projected',y='y_projected', data=df_combo, hue=col)
plt.title("Scatterplot of %s"%(col))
plt.legend(bbox_to_anchor=(1,1))
plt.show()
It is surprising how nearly perfectly the t-SNE plot's clustering structure is explained by 'Gender'. In addition, the sub-clusters reflect 'Income_Category' very well, and customers with a high 'Credit_Limit' and 'Avg_Open_To_Buy' concentrate in the left cluster. The clustering structure does not distinguish the other numerical variables very well.

Based on these visualizations, we can make a naive prediction that the clustering structure of the t-SNE plot heavily reflects 'Gender' and 'Income_Category' rather than 'Attrition_Flag' (whether a customer has churned or not), and thus that there is a weak or no association between 'Attrition_Flag' and customers' demographics.
Since the t-SNE shows potential clusters that are of different sizes, non-spherical, and potentially non-separated, an approach based on Euclidean distances will not suffice. Thus, we will use hierarchical agglomerative clustering with the Gower distance matrix, as sketched below.
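As a minimal sketch (assuming the `dist_mat` Gower matrix computed above), one way to feed the precomputed distances to the scipy routines imported at the top:

# Condense the square Gower matrix and cluster with average linkage, which,
# unlike Ward linkage, does not assume Euclidean distances.
condensed = squareform(dist_mat.astype(np.float64), checks=False)
Z = linkage(condensed, method='average')
dendrogram(Z, no_labels=True)
plt.title('Average-Linkage Dendrogram on Gower Distances')
plt.show()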
Considering that the dataset mixes categorical and numerical attributes, we can use either hierarchical agglomerative clustering on the Gower distance matrix, as above, or the K-Prototypes algorithm, which handles mixed data directly.

For the K-Prototypes algorithm, we need to choose the $k$ and $\gamma$ that agree most closely with the clustering structure of the t-SNE plot.
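For reference, the cost that the elbow plots below track combines a squared Euclidean term over the numerical attributes with a simple matching term over the categorical attributes, weighted by $\gamma$:

$$E = \sum_{l=1}^{k} \sum_{x_i \in C_l} \Big( \sum_{j \in \mathrm{num}} (x_{ij} - \mu_{lj})^2 + \gamma \sum_{j \in \mathrm{cat}} \mathbb{1}[x_{ij} \neq m_{lj}] \Big),$$

where $\mu_{lj}$ is the cluster mean and $m_{lj}$ the cluster mode of attribute $j$. The larger the $\gamma$, the more the categorical attributes drive the clustering.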
x.head()
Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Gender | Education_Level | Marital_Status | Income_Category | Card_Category | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1237 | -1.075348 | 0.517549 | -2.304240 | 1.523878 | 0.516027 | -0.574377 | -0.690154 | 0.331772 | -0.723002 | 0.688372 | -0.671670 | -0.432105 | -0.317525 | 1.236877 | F | Graduate | Married | Less than $40K | Blue |
1201 | 1.375588 | -1.014510 | 1.433715 | 0.255631 | -1.487960 | 0.300031 | -0.554428 | 1.689254 | -0.721337 | -0.629120 | -0.756892 | -0.654053 | -0.362418 | 1.763467 | F | Unknown | Married | Less than $40K | Blue |
6921 | -0.688358 | 1.283578 | -0.757500 | 0.889755 | -0.485967 | -0.574377 | 0.080698 | 0.736320 | 0.007982 | 0.171440 | 0.260265 | 0.943973 | 1.890371 | -0.188856 | F | High School | Married | Unknown | Blue |
6133 | -0.559362 | 1.283578 | 0.015870 | 1.523878 | -0.485967 | -1.448786 | -0.773909 | -1.084144 | -0.666919 | 0.272082 | 0.169858 | 0.588856 | -0.068576 | -0.822913 | F | Graduate | Married | Less than $40K | Blue |
4396 | -0.559362 | 0.517549 | 0.015870 | 0.255631 | 0.516027 | -1.448786 | 0.104226 | 0.581243 | 0.046829 | -1.242119 | 0.397657 | 0.144960 | 0.282402 | -0.256918 | F | High School | Married | Less than $40K | Blue |
from kmodes.kprototypes import KPrototypes
cost = []
for num_clusters in list(range(1,6)):
kp = KPrototypes(n_clusters=num_clusters, random_state=100, gamma = 0.5)
kp.fit_predict(x, categorical=[14, 15, 16, 17, 18])
cost.append(kp.cost_)
print(num_clusters)
1 2 3 4 5
plt.plot(list(range(1,6)),cost)
plt.xlabel('Number of Clusters')
plt.ylabel('Cost')
plt.title('Elbow Method for k-Prototypes Gamma = 0.5')
plt.show()
We can barely detect elbows at k = 2 and 3.
gamma=0.5
for k in range(1,6):
#Clustering with k-prototypes
kp = KPrototypes(n_clusters=k, random_state=100, gamma = 0.5)
df_combo['predicted_cluster']=kp.fit_predict(x, categorical=[14, 15, 16, 17, 18])
#Map the resulting cluster labels onto our chosen t-SNE plot
sns.scatterplot(x='x_projected',y='y_projected', hue='predicted_cluster', palette=sns.color_palette("husl", k), data=df_combo)
plt.title('t-SNE Plot with K-Prototype with k=%s Clusters and Gamma = %s' %(k, gamma))
plt.legend(bbox_to_anchor=(1,1))
plt.show()
cost = []
for num_clusters in list(range(1,6)):
kp = KPrototypes(n_clusters=num_clusters, random_state=100, gamma = 1)
kp.fit_predict(x, categorical=[14, 15, 16, 17, 18])
cost.append(kp.cost_)
print(num_clusters)
1 2 3 4 5
plt.plot(list(range(1,6)),cost)
plt.xlabel('Number of Clusters')
plt.ylabel('Cost')
plt.title('Elbow Method for k-Prototypes Gamma = 1')
plt.show()
As in the elbow plot for $\gamma = 0.5$, the elbows at k = 2 and 3 are barely discernible.
gamma=1
for k in range(1,6):
#Clustering with k-prototypes
kp = KPrototypes(n_clusters=k, random_state=100, gamma = 1)
df_combo['predicted_cluster']=kp.fit_predict(x, categorical=[14, 15, 16, 17, 18])
#Map the resulting cluster labels onto our chosen t-SNE plot
sns.scatterplot(x='x_projected',y='y_projected', hue='predicted_cluster', palette=sns.color_palette("husl", k), data=df_combo)
plt.title('t-SNE Plot with K-Prototype with k=%s Clusters and Gamma = %s' %(k, gamma))
plt.legend(bbox_to_anchor=(1,1))
plt.show()
cost = []
for num_clusters in list(range(1,6)):
kp = KPrototypes(n_clusters=num_clusters, random_state=100, gamma = 5)
kp.fit_predict(x, categorical=[14, 15, 16, 17, 18])
cost.append(kp.cost_)
print(num_clusters)
1 2 3 4 5
plt.plot(list(range(1,6)),cost)
plt.xlabel('Number of Clusters')
plt.ylabel('Cost')
plt.title('Elbow Method for k-Prototypes Gamma = 5')
plt.show()
We can see there is an elbow at k = 2, and the elbow is more dramatic than the elbows in the previous plots.
gamma=5
for k in range(1,6):
#Clustering with k-prototypes
kp = KPrototypes(n_clusters=k, random_state=100, gamma = 5)
df_combo['predicted_cluster']=kp.fit_predict(x, categorical=[14, 15, 16, 17, 18])
#Map the resulting cluster labels onto our chosen t-SNE plot
sns.scatterplot(x='x_projected',y='y_projected', hue='predicted_cluster', palette=sns.color_palette("husl", k), data=df_combo)
plt.title('t-SNE Plot with K-Prototype with k=%s Clusters and Gamma = %s' %(k, gamma))
plt.legend(bbox_to_anchor=(1,1))
plt.show()
cost = []
for num_clusters in list(range(1,6)):
kp = KPrototypes(n_clusters=num_clusters, random_state=100, gamma = 100)
kp.fit_predict(x, categorical=[14, 15, 16, 17, 18])
cost.append(kp.cost_)
print(num_clusters)
1 2 3 4 5
plt.plot(list(range(1,6)),cost)
plt.xlabel('Number of Clusters')
plt.ylabel('Cost')
plt.title('Elbow Method for k-Prototypes Gamma = 100')
plt.show()
We can detect noticeable elbows at k = 2 and 4. Since the plot levels off at k = 4, we would expect that there are four clusters in the dataset.
gamma=100
for k in range(1,6):
#Clustering with k-prototypes
kp = KPrototypes(n_clusters=k, random_state=100, gamma = 100)
df_combo['predicted_cluster']=kp.fit_predict(x, categorical=[14, 15, 16, 17, 18])
#Map the resulting cluster labels onto our chosen t-SNE plot
sns.scatterplot(x='x_projected',y='y_projected', hue='predicted_cluster', palette=sns.color_palette("husl", k), data=df_combo)
plt.title('t-SNE Plot with K-Prototype with k=%s Clusters and Gamma = %s' %(k, gamma))
plt.legend(bbox_to_anchor=(1,1))
plt.show()
Although $\gamma = 5$ and $\gamma = 100$ agree with the t-SNE plot's clustering structure the most, neither reproduces the structure well. Thus, the K-Prototypes algorithm may not be an ideal algorithm for this dataset.
Nevertheless, it might still be interesting to explore the clusters identified with $\gamma = 100$. The analyses are presented below.
kp = KPrototypes(n_clusters=2, random_state=100, gamma = 100)
df_combo['predicted_cluster']=kp.fit_predict(x, categorical=[14, 15, 16, 17, 18])
df_combo.head()
Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | ... | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Gender | Education_Level | Marital_Status | Income_Category | Card_Category | x_projected | y_projected | predicted_cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1237 | -1.075348 | 0.517549 | -2.304240 | 1.523878 | 0.516027 | -0.574377 | -0.690154 | 0.331772 | -0.723002 | 0.688372 | ... | -0.317525 | 1.236877 | F | Graduate | Married | Less than $40K | Blue | 40.452370 | -3.922722 | 0 |
1201 | 1.375588 | -1.014510 | 1.433715 | 0.255631 | -1.487960 | 0.300031 | -0.554428 | 1.689254 | -0.721337 | -0.629120 | ... | -0.362418 | 1.763467 | F | Unknown | Married | Less than $40K | Blue | 48.825188 | -5.651932 | 0 |
6921 | -0.688358 | 1.283578 | -0.757500 | 0.889755 | -0.485967 | -0.574377 | 0.080698 | 0.736320 | 0.007982 | 0.171440 | ... | 1.890371 | -0.188856 | F | High School | Married | Unknown | Blue | 32.771870 | 8.572495 | 0 |
6133 | -0.559362 | 1.283578 | 0.015870 | 1.523878 | -0.485967 | -1.448786 | -0.773909 | -1.084144 | -0.666919 | 0.272082 | ... | -0.068576 | -0.822913 | F | Graduate | Married | Less than $40K | Blue | 4.686581 | -12.751914 | 0 |
4396 | -0.559362 | 0.517549 | 0.015870 | 0.255631 | 0.516027 | -1.448786 | 0.104226 | 0.581243 | 0.046829 | -1.242119 | ... | 0.282402 | -0.256918 | F | High School | Married | Less than $40K | Blue | 32.297958 | 0.200756 | 0 |
5 rows × 22 columns
#Map the resulting cluster labels onto our chosen t-SNE plot
k=2
sns.scatterplot(x='x_projected',y='y_projected', hue='predicted_cluster', palette=sns.color_palette("husl", k), data=df_combo)
plt.title('t-SNE Plot with k-Prototypes with k=%s Clusters and gamma=100' %(k))
plt.legend(bbox_to_anchor=(1,1))
plt.show()
Using the function below, we would like to figure out how cohesive and well separated the clusters are in the dataset.
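Recall that the silhouette coefficient of an observation $i$ compares its mean distance $a(i)$ to the other members of its own cluster with its mean distance $b(i)$ to the members of the nearest other cluster:

$$s(i) = \frac{b(i) - a(i)}{\max\{a(i),\, b(i)\}} \in [-1, 1],$$

so values near 1 indicate a cohesive, well-separated observation, values near 0 a borderline one, and negative values an observation that sits closer to another cluster than to its own.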
from sklearn.metrics import silhouette_samples, silhouette_score
def show_silhouette_plots(X,cluster_labels):
# This package allows us to use "color maps" in our visualizations
import matplotlib.cm as cm
#How many clusters in your clustering?
n_clusters=len(np.unique(cluster_labels))
    # Create a subplot with 1 row and 1 column
fig, ax1 = plt.subplots(1, 1)
fig.set_size_inches(18, 7)
# The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1 to 1, but in this example all
# lie within [-0.1, 1]
ax1.set_xlim([-0.1, 1])
# The (n_clusters+1)*10 is for inserting blank space between silhouette
# plots of individual clusters, to demarcate them clearly.
ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])
# The silhouette_score gives the average value for all the samples.
# This gives a perspective into the density and separation of the formed
# clusters
silhouette_avg = silhouette_score(X, cluster_labels)
print("For n_clusters =", n_clusters,
"The average silhouette_score is :", silhouette_avg)
# Compute the silhouette scores for each sample
sample_silhouette_values = silhouette_samples(X, cluster_labels)
y_lower = 10
for i in range(n_clusters):
# Aggregate the silhouette scores for samples belonging to
# cluster i, and sort them
ith_cluster_silhouette_values = \
sample_silhouette_values[cluster_labels == i]
ith_cluster_silhouette_values.sort()
size_cluster_i = ith_cluster_silhouette_values.shape[0]
y_upper = y_lower + size_cluster_i
color = cm.nipy_spectral(float(i) / n_clusters)
ax1.fill_betweenx(np.arange(y_lower, y_upper),
0, ith_cluster_silhouette_values,
facecolor=color, edgecolor=color, alpha=0.7)
# Label the silhouette plots with their cluster numbers at the middle
ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
# Compute the new y_lower for next plot
y_lower = y_upper + 10 # 10 for the 0 samples
ax1.set_title("The silhouette plot for the various clusters.")
ax1.set_xlabel("The silhouette coefficient values")
ax1.set_ylabel("Cluster label")
# The vertical line for average silhouette score of all the values
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
ax1.set_yticks([]) # Clear the yaxis labels / ticks
ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
plt.show()
return
kp = KPrototypes(n_clusters=2, random_state=100, gamma = 100)
cluster_labels=kp.fit_predict(x, categorical=[14, 15, 16, 17, 18])
cluster_labels
array([0, 0, 0, ..., 1, 1, 0], dtype=uint16)
show_silhouette_plots(dist_mat, cluster_labels)
For n_clusters = 2 The average silhouette_score is : 0.22574311
We can see that both clusters have somewhat low silhouette scores, and cluster 1 seems to have a lower silhouette score than cluster 0. In other words, cluster 1 shows weaker cohesion and separation on average. Also, both clusters contain some observations with negative silhouette scores, indicating that those observations are actually closer to the other cluster than to the one they were assigned to.
# Silhouette score of every observation
df_combo['silhouette_scores'] = silhouette_samples(dist_mat, cluster_labels)
df_combo.head()
Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | ... | Avg_Utilization_Ratio | Gender | Education_Level | Marital_Status | Income_Category | Card_Category | x_projected | y_projected | predicted_cluster | silhouette_scores | 
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1237 | -1.075348 | 0.517549 | -2.304240 | 1.523878 | 0.516027 | -0.574377 | -0.690154 | 0.331772 | -0.723002 | 0.688372 | ... | 1.236877 | F | Graduate | Married | Less than $40K | Blue | 40.452370 | -3.922722 | 0 | 0.406491 |
1201 | 1.375588 | -1.014510 | 1.433715 | 0.255631 | -1.487960 | 0.300031 | -0.554428 | 1.689254 | -0.721337 | -0.629120 | ... | 1.763467 | F | Unknown | Married | Less than $40K | Blue | 48.825188 | -5.651932 | 0 | 0.358675 |
6921 | -0.688358 | 1.283578 | -0.757500 | 0.889755 | -0.485967 | -0.574377 | 0.080698 | 0.736320 | 0.007982 | 0.171440 | ... | -0.188856 | F | High School | Married | Unknown | Blue | 32.771870 | 8.572495 | 0 | 0.283157 |
6133 | -0.559362 | 1.283578 | 0.015870 | 1.523878 | -0.485967 | -1.448786 | -0.773909 | -1.084144 | -0.666919 | 0.272082 | ... | -0.822913 | F | Graduate | Married | Less than $40K | Blue | 4.686581 | -12.751914 | 0 | 0.390206 |
4396 | -0.559362 | 0.517549 | 0.015870 | 0.255631 | 0.516027 | -1.448786 | 0.104226 | 0.581243 | 0.046829 | -1.242119 | ... | -0.256918 | F | High School | Married | Less than $40K | Blue | 32.297958 | 0.200756 | 0 | 0.369592 |
5 rows × 23 columns
df_combo.sort_values(by=['silhouette_scores'], ascending=False)
Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | ... | Avg_Utilization_Ratio | Gender | Education_Level | Marital_Status | Income_Category | Card_Category | x_projected | y_projected | predicted_cluster | silhouette_scores | 
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2696 | 0.085622 | 0.517549 | -0.499710 | 0.255631 | 0.516027 | -0.574377 | -0.620460 | -0.015464 | -0.619004 | -0.725187 | ... | 0.384303 | F | Uneducated | Married | Less than $40K | Blue | 40.694653 | -16.122297 | 0 | 0.420295 |
5837 | -0.688358 | 0.517549 | 0.144765 | -0.378492 | -1.487960 | -0.574377 | -0.773909 | -0.224480 | -0.751826 | -0.418687 | ... | 1.082840 | F | College | Married | Less than $40K | Blue | 43.299656 | -9.447454 | 0 | 0.419305 |
6229 | -0.172372 | 2.049607 | 0.015870 | 0.255631 | -1.487960 | -0.574377 | -0.679500 | 0.081178 | -0.687596 | -1.059134 | ... | 0.799843 | F | Graduate | Married | Less than $40K | Blue | 40.632614 | -5.328576 | 0 | 0.414798 |
6783 | 0.472611 | -0.248481 | 0.918135 | -0.378492 | 0.516027 | -0.574377 | -0.773909 | -0.061538 | -0.767920 | -0.345493 | ... | 1.444647 | F | Unknown | Married | Less than $40K | Blue | 51.003998 | -8.726282 | 0 | 0.414422 |
5664 | -0.043375 | 0.517549 | 0.015870 | 0.889755 | -0.485967 | -0.574377 | -0.773909 | 0.280080 | -0.801661 | 0.217186 | ... | 2.200500 | F | College | Married | Less than $40K | Blue | 42.324551 | -10.215382 | 0 | 0.413030 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5164 | 0.601608 | -0.248481 | 0.015870 | -0.378492 | 3.522007 | 0.300031 | 1.967655 | -1.084144 | 2.074961 | -0.386664 | ... | -0.822913 | M | Graduate | Married | $60K - $80K | Silver | -63.441071 | 16.660053 | 0 | -0.304298 |
8155 | 0.343615 | 0.517549 | 0.144765 | -1.646740 | -1.487960 | -1.448786 | 2.896984 | 0.394702 | 2.858333 | -0.761784 | ... | -0.686788 | M | Graduate | Married | $120K + | Blue | -55.980801 | 8.018754 | 0 | -0.308138 |
4090 | -0.172372 | 0.517549 | 0.015870 | -0.378492 | -1.487960 | -1.448786 | -0.748972 | 0.165459 | -0.765401 | -0.674866 | ... | 1.573608 | F | High School | Single | $40K - $60K | Blue | 5.391948 | 12.250793 | 1 | -0.308980 |
123 | 0.085622 | 0.517549 | 0.660345 | 0.255631 | 0.516027 | 0.300031 | -0.513144 | 0.646420 | -0.577049 | -0.775508 | ... | 0.635060 | F | High School | Single | $40K - $60K | Blue | 6.377181 | 11.181173 | 1 | -0.316340 |
5445 | -1.204345 | -1.014510 | -1.144185 | 0.255631 | -0.485967 | -0.574377 | -0.734545 | -0.194139 | -0.715455 | -0.057291 | ... | 0.760439 | F | High School | Single | $40K - $60K | Blue | 5.550048 | 9.193234 | 1 | -0.319474 |
1600 rows × 23 columns
From this table, we can see the following:
Observation 2696 has the highest silhouette score in the dataset, meaning that it is the most cohesive member of its assigned cluster and the best separated from the other cluster.
Observation 5445 has the lowest silhouette score, indicating that it is actually closer to cluster 0 (the other cluster) than to cluster 1 (its assigned cluster).
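Rather than reading these off the sorted table, the extreme observations can be pulled out programmatically; a minimal sketch, assuming dist_mat and cluster_labels are the precomputed Gower distance matrix and K-Prototypes labels used above:
# argmax/argmin over the per-observation silhouette scores give the best-
# and worst-fitting customers; map the positions back to df_combo's index.
scores = silhouette_samples(dist_mat, cluster_labels, metric='precomputed')
print(df_combo.index[scores.argmax()], df_combo.index[scores.argmin()])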
pd.concat([X.loc[[2696]], X.loc[[5445]]])
Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2696 | Existing Customer | 47 | F | 3 | Uneducated | Married | Less than $40K | Blue | 32 | 4 | 3 | 2 | 2821.0 | 951 | 1870.0 | 0.577 | 1452 | 38 | 1.923 | 0.337 |
5445 | Attrited Customer | 37 | F | 1 | High School | Single | $40K - $60K | Blue | 27 | 4 | 2 | 2 | 1793.0 | 792 | 1001.0 | 0.723 | 2374 | 40 | 0.429 | 0.442 |
We now need to transform the scaled numerical attributes back to their original units.
kp.cluster_centroids_
array([['0.0010891548145757226', '-0.003828747625808713', '-0.004336322901852015', '0.06422083050091101', '-0.009849195546806175', '-0.020953981245417797', '-0.39415468974810103', '-0.013402635541569534', '-0.3928763683323007', '0.009038376377362404', '-0.09001427737062087', '-0.042150822477012566', '-0.007247537012745263', '0.18487639997164096', 'F', 'Graduate', 'Married', 'Less than $40K', 'Blue'], ['-0.0019521151737683048', '0.006862345221127362', '0.0077720831068100234', '-0.11510435065346653', '0.017652921163300077', '0.03755626306988365', '0.7064517737719015', '0.024021826703650023', '0.7041606113041412', '-0.016199672843894047', '0.1613344901564182', '0.07554780922145152', '0.012989913633664599', '-0.3313578756908844', 'M', 'High School', 'Single', '$80K - $120K', 'Blue']], dtype='<U32')
# Split the columns into numerical (first 14) and categorical attributes
num_list = list(x.columns[0:14])
cat_list = list(x.columns[14:])
df_proto_scaled = pd.DataFrame(kp.cluster_centroids_, columns = x.columns)
df_proto_scaled
Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Gender | Education_Level | Marital_Status | Income_Category | Card_Category | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0010891548145757226 | -0.003828747625808713 | -0.004336322901852015 | 0.06422083050091101 | -0.009849195546806175 | -0.020953981245417797 | -0.39415468974810103 | -0.013402635541569534 | -0.3928763683323007 | 0.009038376377362404 | -0.09001427737062087 | -0.042150822477012566 | -0.007247537012745263 | 0.18487639997164096 | F | Graduate | Married | Less than $40K | Blue |
1 | -0.0019521151737683048 | 0.006862345221127362 | 0.0077720831068100234 | -0.11510435065346653 | 0.017652921163300077 | 0.03755626306988365 | 0.7064517737719015 | 0.024021826703650023 | 0.7041606113041412 | -0.016199672843894047 | 0.1613344901564182 | 0.07554780922145152 | 0.012989913633664599 | -0.3313578756908844 | M | High School | Single | $80K - $120K | Blue |
df_proto_num = df_proto_scaled.copy()
df_proto_num = df_proto_num.drop(cat_list, axis = 1)
df_proto_num
Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0010891548145757226 | -0.003828747625808713 | -0.004336322901852015 | 0.06422083050091101 | -0.009849195546806175 | -0.020953981245417797 | -0.39415468974810103 | -0.013402635541569534 | -0.3928763683323007 | 0.009038376377362404 | -0.09001427737062087 | -0.042150822477012566 | -0.007247537012745263 | 0.18487639997164096 |
1 | -0.0019521151737683048 | 0.006862345221127362 | 0.0077720831068100234 | -0.11510435065346653 | 0.017652921163300077 | 0.03755626306988365 | 0.7064517737719015 | 0.024021826703650023 | 0.7041606113041412 | -0.016199672843894047 | 0.1613344901564182 | 0.07554780922145152 | 0.012989913633664599 | -0.3313578756908844 |
df_proto_num.dtypes
Customer_Age                object
Dependent_count             object
Months_on_book              object
Total_Relationship_Count    object
Months_Inactive_12_mon      object
Contacts_Count_12_mon       object
Credit_Limit                object
Total_Revolving_Bal         object
Avg_Open_To_Buy             object
Total_Amt_Chng_Q4_Q1        object
Total_Trans_Amt             object
Total_Trans_Ct              object
Total_Ct_Chng_Q4_Q1         object
Avg_Utilization_Ratio       object
dtype: object
df_proto_num = df_proto_num.astype('float64')
df_proto_num.dtypes
Customer_Age                float64
Dependent_count             float64
Months_on_book              float64
Total_Relationship_Count    float64
Months_Inactive_12_mon      float64
Contacts_Count_12_mon       float64
Credit_Limit                float64
Total_Revolving_Bal         float64
Avg_Open_To_Buy             float64
Total_Amt_Chng_Q4_Q1        float64
Total_Trans_Amt             float64
Total_Trans_Ct              float64
Total_Ct_Chng_Q4_Q1         float64
Avg_Utilization_Ratio       float64
dtype: object
x_ss=ss.inverse_transform(df_proto_num)
x_ss
array([[4.63446933e+01, 2.31937683e+00, 3.58432327e+01, 3.69814995e+00, 2.47517040e+00, 2.63291139e+00, 4.86019133e+03, 9.52834469e+02, 3.90735686e+03, 7.37499513e-01, 3.59402045e+03, 5.57848101e+01, 6.43027264e-01, 2.81329114e-01], [4.63211169e+01, 2.33333333e+00, 3.59371728e+01, 3.41535777e+00, 2.50261780e+00, 2.69982548e+00, 1.47775428e+04, 9.86137871e+02, 1.37914049e+04, 7.31982548e-01, 4.36969634e+03, 5.84363002e+01, 6.47986038e-01, 1.37219895e-01]])
df_proto_num=pd.DataFrame(x_ss, columns=num_list)
df_proto_num
Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 46.344693 | 2.319377 | 35.843233 | 3.698150 | 2.475170 | 2.632911 | 4860.191334 | 952.834469 | 3907.356865 | 0.737500 | 3594.020448 | 55.78481 | 0.643027 | 0.281329 |
1 | 46.321117 | 2.333333 | 35.937173 | 3.415358 | 2.502618 | 2.699825 | 14777.542757 | 986.137871 | 13791.404887 | 0.731983 | 4369.696335 | 58.43630 | 0.647986 | 0.137220 |
df_proto_cat = df_proto_scaled.copy()
df_proto_cat = df_proto_cat.drop(num_list, axis = 1)
df_proto_cat
Gender | Education_Level | Marital_Status | Income_Category | Card_Category | |
---|---|---|---|---|---|
0 | F | Graduate | Married | Less than $40K | Blue |
1 | M | High School | Single | $80K - $120K | Blue |
df_proto=pd.concat([df_proto_num, df_proto_cat], axis=1)
df_proto
Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Gender | Education_Level | Marital_Status | Income_Category | Card_Category | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 46.344693 | 2.319377 | 35.843233 | 3.698150 | 2.475170 | 2.632911 | 4860.191334 | 952.834469 | 3907.356865 | 0.737500 | 3594.020448 | 55.78481 | 0.643027 | 0.281329 | F | Graduate | Married | Less than $40K | Blue |
1 | 46.321117 | 2.333333 | 35.937173 | 3.415358 | 2.502618 | 2.699825 | 14777.542757 | 986.137871 | 13791.404887 | 0.731983 | 4369.696335 | 58.43630 | 0.647986 | 0.137220 | M | High School | Single | $80K - $120K | Blue |
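As an aside, the step-by-step unscaling above can be condensed; a minimal sketch, assuming ss is the StandardScaler fitted earlier on the 14 numerical columns:
# Cast the scaled numerical centroid values to float and invert the scaler
# in place; the categorical columns already hold the cluster modes.
proto = pd.DataFrame(kp.cluster_centroids_, columns=x.columns)
proto[num_list] = ss.inverse_transform(proto[num_list].astype('float64'))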
The two prototypes share many features, but several attributes distinguish them.
For the numerical variables, the Cluster 0 prototype has a low 'Credit_Limit', low 'Avg_Open_To_Buy', low 'Total_Trans_Amt', and high 'Avg_Utilization_Ratio'; the Cluster 1 prototype is the opposite on these features.
For the categorical variables, the Cluster 0 prototype is female, married, holds a graduate degree, and has an annual income of less than $40K. The Cluster 1 prototype is male, single, holds a high school diploma, and has an annual income between $80K and $120K.
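To make this comparison less impressionistic, the numerical attributes can be ranked by how strongly they separate the two prototypes; a small sketch using the unscaled prototype table:
# Absolute gap between the two prototypes, relative to each attribute's
# mean across the prototypes; larger values separate the clusters more.
gap = (df_proto.loc[1, num_list] - df_proto.loc[0, num_list]).abs()
print((gap / df_proto[num_list].mean()).sort_values(ascending=False))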
X['k_proto_predicted_cluster'] = df_combo['predicted_cluster']
X
Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | ... | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | k_proto_predicted_cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1237 | Existing Customer | 38 | F | 3 | Graduate | Married | Less than $40K | Blue | 18 | 6 | ... | 2 | 2193.0 | 1260 | 933.0 | 0.886 | 1799 | 47 | 0.567 | 0.575 | 0 |
1201 | Existing Customer | 57 | F | 1 | Unknown | Married | Less than $40K | Blue | 47 | 4 | ... | 3 | 3416.0 | 2468 | 948.0 | 0.598 | 1536 | 42 | 0.556 | 0.722 | 0 |
6921 | Existing Customer | 41 | F | 4 | High School | Married | Unknown | Blue | 30 | 5 | ... | 2 | 9139.0 | 1620 | 7519.0 | 0.773 | 4675 | 78 | 1.108 | 0.177 | 0 |
6133 | Existing Customer | 42 | F | 4 | Graduate | Married | Less than $40K | Blue | 36 | 6 | ... | 1 | 1438.3 | 0 | 1438.3 | 0.795 | 4396 | 70 | 0.628 | 0.000 | 0 |
4396 | Existing Customer | 42 | F | 3 | High School | Married | Less than $40K | Blue | 36 | 4 | ... | 1 | 9351.0 | 1482 | 7869.0 | 0.464 | 5099 | 60 | 0.714 | 0.158 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2077 | Attrited Customer | 58 | M | 2 | Uneducated | Single | $120K + | Blue | 46 | 2 | ... | 4 | 16163.0 | 0 | 16163.0 | 0.542 | 2171 | 48 | 0.778 | 0.000 | 1 |
8757 | Attrited Customer | 50 | M | 4 | High School | Single | $120K + | Blue | 41 | 4 | ... | 3 | 6982.0 | 0 | 6982.0 | 1.018 | 4830 | 50 | 0.724 | 0.000 | 1 |
3094 | Attrited Customer | 49 | M | 4 | Graduate | Single | $60K - $80K | Blue | 36 | 1 | ... | 5 | 5662.0 | 1484 | 4178.0 | 0.434 | 1693 | 25 | 0.250 | 0.262 | 1 |
5553 | Attrited Customer | 62 | M | 0 | High School | Single | $60K - $80K | Blue | 51 | 3 | ... | 4 | 18734.0 | 0 | 18734.0 | 0.674 | 2112 | 52 | 0.857 | 0.000 | 1 |
8784 | Attrited Customer | 53 | F | 2 | High School | Single | Less than $40K | Blue | 46 | 6 | ... | 2 | 3199.0 | 0 | 3199.0 | 1.047 | 4805 | 59 | 0.639 | 0.000 | 0 |
1600 rows × 21 columns
for col in cat_list:
    ctab = pd.crosstab(X['k_proto_predicted_cluster'], X[col], normalize='index')
    print(ctab)
    ctab.plot.bar()
    plt.title(col)
    plt.legend(bbox_to_anchor=(1,1))
    plt.show()
    print('----------------------------------------------------')
Gender                            F         M
k_proto_predicted_cluster
0                          0.831548  0.168452
1                          0.080279  0.919721
----------------------------------------------------
Education_Level             College  Doctorate  Graduate  High School  Post-Graduate  Uneducated   Unknown
k_proto_predicted_cluster
0                          0.099318   0.054528  0.362220     0.150925       0.053554    0.136319  0.143135
1                          0.108202   0.038394  0.198953     0.284468       0.055846    0.148342  0.165794
----------------------------------------------------
Marital_Status             Divorced   Married    Single   Unknown
k_proto_predicted_cluster
0                          0.084713  0.536514  0.317429  0.061344
1                          0.076789  0.293194  0.530541  0.099476
----------------------------------------------------
Income_Category             $120K +  $40K - $60K  $60K - $80K  $80K - $120K  Less than $40K   Unknown
k_proto_predicted_cluster
0                          0.020448     0.189873     0.060370      0.017527        0.557936  0.153846
1                          0.160558     0.148342     0.232112      0.375218        0.034904  0.048866
----------------------------------------------------
Card_Category                  Blue      Gold  Platinum    Silver
k_proto_predicted_cluster
0                          0.962025  0.003895  0.002921  0.031159
1                          0.876091  0.017452  0.005236  0.101222
----------------------------------------------------
for col in num_list:
    sns.boxplot(x='k_proto_predicted_cluster', y=col, data=X)
    plt.title(col)
    plt.show()
Based on the K-Prototypes algorithm's predicted clusters, we found some noticeable differences between clusters 0 and 1 in the plots above.
Among these differences, we should focus on income and credit limit, because these attributes are the most directly tied to money and may therefore be key to determining churn. Customers in cluster 1 tend to have higher incomes than those in cluster 0 on average, and so can better afford their spending; this ties in with their higher credit limits and larger open-to-buy amounts on average.
Based on these points, one prediction we could make is that customers with higher incomes are more economically active, constantly spending and saving money through their bank accounts and cards. In other words, they would be more likely to stay with the bank.
Therefore, we expect cluster 1 to be existing customers.
# The values of 'k_proto_predicted_cluster' are either 0 or 1, we need to convert them into 'Existing' or 'Attrited'
X['k_proto_predicted_cluster'] = X['k_proto_predicted_cluster'].map({1: 'Existing Customer', 0:'Attrited Customer'})
X
Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | ... | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | k_proto_predicted_cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1237 | Existing Customer | 38 | F | 3 | Graduate | Married | Less than $40K | Blue | 18 | 6 | ... | 2 | 2193.0 | 1260 | 933.0 | 0.886 | 1799 | 47 | 0.567 | 0.575 | Attrited Customer |
1201 | Existing Customer | 57 | F | 1 | Unknown | Married | Less than $40K | Blue | 47 | 4 | ... | 3 | 3416.0 | 2468 | 948.0 | 0.598 | 1536 | 42 | 0.556 | 0.722 | Attrited Customer |
6921 | Existing Customer | 41 | F | 4 | High School | Married | Unknown | Blue | 30 | 5 | ... | 2 | 9139.0 | 1620 | 7519.0 | 0.773 | 4675 | 78 | 1.108 | 0.177 | Attrited Customer |
6133 | Existing Customer | 42 | F | 4 | Graduate | Married | Less than $40K | Blue | 36 | 6 | ... | 1 | 1438.3 | 0 | 1438.3 | 0.795 | 4396 | 70 | 0.628 | 0.000 | Attrited Customer |
4396 | Existing Customer | 42 | F | 3 | High School | Married | Less than $40K | Blue | 36 | 4 | ... | 1 | 9351.0 | 1482 | 7869.0 | 0.464 | 5099 | 60 | 0.714 | 0.158 | Attrited Customer |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2077 | Attrited Customer | 58 | M | 2 | Uneducated | Single | $120K + | Blue | 46 | 2 | ... | 4 | 16163.0 | 0 | 16163.0 | 0.542 | 2171 | 48 | 0.778 | 0.000 | Existing Customer |
8757 | Attrited Customer | 50 | M | 4 | High School | Single | $120K + | Blue | 41 | 4 | ... | 3 | 6982.0 | 0 | 6982.0 | 1.018 | 4830 | 50 | 0.724 | 0.000 | Existing Customer |
3094 | Attrited Customer | 49 | M | 4 | Graduate | Single | $60K - $80K | Blue | 36 | 1 | ... | 5 | 5662.0 | 1484 | 4178.0 | 0.434 | 1693 | 25 | 0.250 | 0.262 | Existing Customer |
5553 | Attrited Customer | 62 | M | 0 | High School | Single | $60K - $80K | Blue | 51 | 3 | ... | 4 | 18734.0 | 0 | 18734.0 | 0.674 | 2112 | 52 | 0.857 | 0.000 | Existing Customer |
8784 | Attrited Customer | 53 | F | 2 | High School | Single | Less than $40K | Blue | 46 | 6 | ... | 2 | 3199.0 | 0 | 3199.0 | 1.047 | 4805 | 59 | 0.639 | 0.000 | Attrited Customer |
1600 rows × 21 columns
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score
print(homogeneity_score(X['Attrition_Flag'], X['k_proto_predicted_cluster']))
print(completeness_score(X['Attrition_Flag'], X['k_proto_predicted_cluster']))
print(v_measure_score(X['Attrition_Flag'], X['k_proto_predicted_cluster']))
3.0645283160971086e-05
3.2562713008623766e-05
3.157491523576353e-05
from sklearn.metrics import adjusted_rand_score
adjusted_rand_score(X['Attrition_Flag'], X['k_proto_predicted_cluster'])
-0.0005362839934900065
Based on the supervised clustering evaluation metrics, all scores are vanishingly small. This indicates that the K-Prototypes labeling does not match the pre-assigned labels at all.
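As a quick sanity check on these scores, we can cross-tabulate the predicted clusters against the true attrition flag directly; a minimal sketch using the columns built above:
# Row-normalized crosstab of true label vs. predicted cluster. If the
# clusters captured churn, each row would be dominated by a different
# cluster; near-identical rows are consistent with the near-zero scores.
print(pd.crosstab(X['Attrition_Flag'], X['k_proto_predicted_cluster'], normalize='index'))
As a further check, the next cell re-runs K-Prototypes with a much larger gamma. In K-Prototypes, gamma weights the categorical mismatch term relative to the numerical distance, so a large value forces the clustering to follow the categorical attributes more closely.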
k = 2
kp = KPrototypes(n_clusters=k, random_state=100, gamma = 100)
df_combo['predicted_cluster']=kp.fit_predict(x, categorical=[14, 15, 16, 17, 18])
sns.scatterplot(x='x_projected',y='y_projected', hue='predicted_cluster', palette=sns.color_palette("husl", k), style=X['Attrition_Flag'], data=df_combo)
plt.title('t-SNE Plot with K-Prototypes with k=%s Clusters and gamma=100' %(k))
plt.legend(bbox_to_anchor=(1,1))
plt.show()
We will now test out HAC using three linkages: complete, average, and single.
We will test each linkage for cluster counts 2 through 9 and plot the results against the t-SNE projection. We will combine these plots with an analysis of the silhouette scores to determine which clustering model to select.
sil_scores_r1 = {}
# complete linkage
sil_ = []
for k in range(2, 10):
    hac = AgglomerativeClustering(n_clusters = k, affinity = 'precomputed', linkage = 'complete')
    Y_pred = hac.fit_predict(dist_mat_bankinfo)
    sil_.append(silhouette_score(dist_mat_bankinfo, Y_pred, metric = 'precomputed'))
    # Reuse the fitted labels rather than refitting for the stored column
    df_q2['compl_clusters_{}'.format(k)] = Y_pred
sil_scores_r1['complete'] = sil_
# average linkage
sil_ = []
for k in range(2, 10):
    hac = AgglomerativeClustering(n_clusters = k, affinity = 'precomputed', linkage = 'average')
    Y_pred = hac.fit_predict(dist_mat_bankinfo)
    sil_.append(silhouette_score(dist_mat_bankinfo, Y_pred, metric = 'precomputed'))
    df_q2['avg_clusters_{}'.format(k)] = Y_pred
sil_scores_r1['average'] = sil_
# single linkage
sil_ = []
for k in range(2, 10):
    hac = AgglomerativeClustering(n_clusters = k, affinity = 'precomputed', linkage = 'single')
    Y_pred = hac.fit_predict(dist_mat_bankinfo)
    sil_.append(silhouette_score(dist_mat_bankinfo, Y_pred, metric = 'precomputed'))
    df_q2['single_clusters_{}'.format(k)] = Y_pred
sil_scores_r1['single'] = sil_
sil_scores_df_hac_r1 = pd.DataFrame.from_dict(sil_scores_r1)
sil_hac_r1_melted = pd.melt(sil_scores_df_hac_r1)
sil_hac_r1_melted['clusters'] = list(range(2, 10)) * 3
fig, ax = plt.subplots(8, 3, figsize = (24, 60))
# Define the color map once, outside the loop
colors = {'0': 'red', '1': 'green', '2': 'blue', '3': 'orange', '4': 'purple',
          '5': 'black', '6': 'gray', '7': 'yellow', '8': 'cyan'}
for k in range(2, 10):
    ax[k - 2, 0].scatter(df_tsne_r1['x'], df_tsne_r1['y'], c = df_q2['compl_clusters_{}'.format(k)].astype(str).map(colors))
    ax[k - 2, 0].set_title("Complete, {} clusters".format(k), fontsize = 14)
    ax[k - 2, 1].scatter(df_tsne_r1['x'], df_tsne_r1['y'], c = df_q2['avg_clusters_{}'.format(k)].astype(str).map(colors))
    ax[k - 2, 1].set_title("Average, {} clusters".format(k), fontsize = 14)
    ax[k - 2, 2].scatter(df_tsne_r1['x'], df_tsne_r1['y'], c = df_q2['single_clusters_{}'.format(k)].astype(str).map(colors))
    ax[k - 2, 2].set_title("Single, {} clusters".format(k), fontsize = 14)
plt.show()
sns.lineplot(x = "clusters", y = "value", hue = "variable", data = sil_hac_r1_melted)
plt.title("Hierarchical Agglomerative Clustering")
plt.xlabel("Clusters")
plt.ylabel("Silhouette Score")
plt.legend(title = "Linkage")
plt.show()
It is clear that single linkage performs poorly, pulling out tiny clusters. Up through nine clusters, average linkage only defines two main clusters, and beyond six clusters, complete linkage begins to pull out tiny clusters. The silhouette plot tells us that two clusters is the preferred number; however, because the clusters are irregular and not evenly separated, this should be taken with a grain of salt. Given these factors, we will consider two models: average linkage with two clusters and complete linkage with six clusters.
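For a numerical confirmation of this reading, the melted silhouette table can simply be sorted; a small sketch:
# The top rows show which (linkage, k) combinations score highest.
print(sil_hac_r1_melted.sort_values('value', ascending=False).head())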
We will now analyze both clusterings, starting with the average linkage.
for col in bank_info:
    if df_q2[col].dtype in ['int64', 'float64']:
        plt.figure(figsize = (8, 6))
        sns.boxplot(x = 'avg_clusters_2', y = col, data = df_q2)
        plt.show()
    else:
        plt.figure(figsize = (8, 6))
        sns.countplot(x = 'avg_clusters_2', hue = col, data = df_q2)
        plt.show()
From this we immediately see that the clusters are defined entirely by card category (blue vs. non-blue). Aside from the insight that card category is a strong variable when clustering banking information, this is not very informative. We will move on to the results from the complete linkage.
for col in bank_info:
    if df_q2[col].dtype in ['int64', 'float64']:
        plt.figure(figsize = (8, 6))
        sns.boxplot(x = 'compl_clusters_6', y = col, data = df_q2)
        plt.show()
    else:
        plt.figure(figsize = (8, 6))
        sns.countplot(x = 'compl_clusters_6', hue = col, data = df_q2)
        plt.show()
From this clustering, we again see a separation between blue and non-blue clusters. However, because there are multiple clusters of each kind, we can compare them relative to one another: blue clusters 0, 1, 3, and 4 will be compared with each other, and non-blue clusters 2 and 5 with each other.
The following tables summarize each cluster relative to its similarly-colored counterparts. We see that aside from card category, total transaction count, total transaction amount, and credit limit are important factors in the clustering.
Blue Cluster | Definition |
---|---|
0 | High transaction counts and amounts, low number of products, low credit limit |
1 | Low revolving balance, low credit limit |
3 | High utilization ratio, low credit limit |
4 | Low transaction counts and amounts, high credit limit |
Non-blue Cluster | Definition |
---|---|
2 | High transaction counts and amounts, lower credit limit |
5 | Low transaction counts and amounts, high credit limit |
We will now analyze the demographic variables that were not used in clustering.
for col in cust_dem:
    if df_q2[col].dtype in ['int64', 'float64']:
        plt.figure(figsize = (8, 6))
        sns.boxplot(x = 'compl_clusters_6', y = col, data = df_q2)
        plt.show()
    else:
        plt.figure(figsize = (8, 6))
        sns.countplot(x = 'compl_clusters_6', hue = col, data = df_q2)
        plt.show()
We see that the most important feature is income level, which is not all too surprising. The tables below describe the income characteristics of the clusters.
Blue Cluster | Income Levels |
---|---|
0 | Mixed income levels |
1 | Skewed to lower incomes |
3 | Heavily skewed to lower incomes |
4 | Heavily skewed to higher incomes |
Non-Blue Cluster | Income Levels |
---|---|
2 | Lower Income |
5 | Higher Income |
From this result, we see that clusters with lower credit limits have lower income levels and vice versa.
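A quick sketch quantifying this, assuming 'Income_Category' is among the demographic columns kept in df_q2:
# Income distribution within each complete-linkage cluster, as row percentages.
round(pd.crosstab(df_q2['compl_clusters_6'], df_q2['Income_Category'], normalize='index')*100, 2)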
Tables providing additional quantitative information are below:
# average linkage clustering card category percents
round(pd.crosstab(df_q2['avg_clusters_2'], df_q2['Card_Category'], normalize='index')*100, 2)
Card_Category | Blue | Gold | Platinum | Silver |
---|---|---|---|---|
avg_clusters_2 | ||||
0 | 99.98 | 0.02 | 0.0 | 0.00 |
1 | 0.00 | 16.55 | 2.9 | 80.55 |
# complete linkage cluster sizes
pd.crosstab(df_q2['compl_clusters_6'], df_q2['Card_Category']).sum(1)
compl_clusters_6
0     747
1    3618
2     344
3    4312
4     783
5     323
dtype: int64
# complete linkage card category percentages
round(pd.crosstab(df_q2['compl_clusters_6'], df_q2['Card_Category'], normalize='index')*100, 2)
Card_Category | Blue | Gold | Platinum | Silver |
---|---|---|---|---|
compl_clusters_6 | ||||
0 | 96.79 | 2.95 | 0.27 | 0.00 |
1 | 100.00 | 0.00 | 0.00 | 0.00 |
2 | 0.00 | 5.23 | 0.58 | 94.19 |
3 | 100.00 | 0.00 | 0.00 | 0.00 |
4 | 100.00 | 0.00 | 0.00 | 0.00 |
5 | 0.00 | 23.53 | 4.95 | 71.52 |
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram
# Convert the square distance matrix into the condensed vector form that scipy's linkage expects
dm = squareform(dist_mat, force = 'tovector')
dm
array([0.22388041, 0.23646867, 0.13484037, ..., 0.29175434, 0.3959032 , 0.2725757 ], dtype=float32)
Z = linkage(dm, method='single')
Z
array([[1.03600000e+03, 1.51400000e+03, 1.03925429e-02, 2.00000000e+00], [1.38300000e+03, 1.49800000e+03, 1.35198161e-02, 2.00000000e+00], [1.24800000e+03, 1.43200000e+03, 1.77286211e-02, 2.00000000e+00], ..., [3.23000000e+02, 3.19500000e+03, 1.87205940e-01, 1.59700000e+03], [3.06800000e+03, 3.19600000e+03, 1.89704299e-01, 1.59900000e+03], [5.67000000e+02, 3.19700000e+03, 2.08067313e-01, 1.60000000e+03]])
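Each row of Z records one merge as [left node, right node, merge height, size of the merged cluster]. As a side note, flat clusters can also be cut directly from Z rather than refitting AgglomerativeClustering; a minimal sketch:
from scipy.cluster.hierarchy import fcluster
# Cut the single-linkage tree into two flat clusters. Up to a permutation
# of the labels (fcluster numbers clusters from 1), this should agree with
# AgglomerativeClustering(affinity='precomputed', linkage='single').
labels_k2 = fcluster(Z, t=2, criterion='maxclust')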
fig, ax = plt.subplots(figsize=(25, 30))
d = dendrogram(Z, orientation='right', ax=ax, truncate_mode='lastp', p=200, no_labels=True)
ax.set_xlabel('Dissimilarity', fontsize=18)
ax.set_ylabel('Bank Customers', fontsize=18)
plt.yticks(fontsize=12)
plt.title('Single Linkage', fontsize=18)
plt.show()
for k in range(2, 11):
    # Clustering from dendrogram with k clusters
    hac = AgglomerativeClustering(n_clusters=k, affinity='precomputed', linkage='single')
    df_combo['predicted_cluster'] = hac.fit_predict(dist_mat)
    # Map the resulting cluster labels onto our chosen t-SNE plot
    sns.scatterplot(x='x_projected', y='y_projected', hue='predicted_cluster', palette=sns.color_palette("husl", k), data=df_combo)
    plt.title('t-SNE Plot with Single Linkage Clustering with k=%s Clusters' %(k))
    plt.legend(bbox_to_anchor=(1,1))
    plt.show()
Z = linkage(dm, method='complete')
Z
array([[1.03600000e+03, 1.51400000e+03, 1.03925429e-02, 2.00000000e+00], [1.38300000e+03, 1.49800000e+03, 1.35198161e-02, 2.00000000e+00], [1.24800000e+03, 1.43200000e+03, 1.77286211e-02, 2.00000000e+00], ..., [3.19100000e+03, 3.19300000e+03, 6.00329041e-01, 9.04000000e+02], [3.18800000e+03, 3.19500000e+03, 6.47618771e-01, 6.96000000e+02], [3.19600000e+03, 3.19700000e+03, 6.97780252e-01, 1.60000000e+03]])
fig, ax = plt.subplots(figsize=(25, 30))
d = dendrogram(Z, orientation='right', ax=ax, truncate_mode='lastp', p=200, no_labels=True)
ax.set_xlabel('Dissimilarity', fontsize=18)
ax.set_ylabel('Bank Customers', fontsize=18)
plt.yticks(fontsize=12)
plt.title('Complete Linkage', fontsize=18)
plt.show()
for k in range(2, 11):
    # Clustering from dendrogram with k clusters
    hac = AgglomerativeClustering(n_clusters=k, affinity='precomputed', linkage='complete')
    df_combo['predicted_cluster'] = hac.fit_predict(dist_mat)
    # Map the resulting cluster labels onto our chosen t-SNE plot
    sns.scatterplot(x='x_projected', y='y_projected', hue='predicted_cluster', palette=sns.color_palette("husl", k), data=df_combo)
    plt.title('t-SNE Plot with Complete Linkage Clustering with k=%s Clusters' %(k))
    plt.legend(bbox_to_anchor=(1,1))
    plt.show()
Z = linkage(dm, method='average')
Z
array([[1.03600000e+03, 1.51400000e+03, 1.03925429e-02, 2.00000000e+00], [1.38300000e+03, 1.49800000e+03, 1.35198161e-02, 2.00000000e+00], [1.24800000e+03, 1.43200000e+03, 1.77286211e-02, 2.00000000e+00], ..., [3.09700000e+03, 3.19000000e+03, 3.52305469e-01, 1.19000000e+02], [3.18900000e+03, 3.19500000e+03, 3.61188724e-01, 1.48100000e+03], [3.19600000e+03, 3.19700000e+03, 4.04739148e-01, 1.60000000e+03]])
fig, ax = plt.subplots(figsize=(25, 30))
d = dendrogram(Z, orientation='right', ax=ax, truncate_mode='lastp', p=200, no_labels=True)
ax.set_xlabel('Dissimilarity', fontsize=18)
ax.set_ylabel('Bank Customers', fontsize=18)
plt.yticks(fontsize=12)
plt.title('Average Linkage', fontsize=18)
plt.show()
for k in range(2, 11):
    # Clustering from dendrogram with k clusters
    hac = AgglomerativeClustering(n_clusters=k, affinity='precomputed', linkage='average')
    df_combo['predicted_cluster'] = hac.fit_predict(dist_mat)
    # Map the resulting cluster labels onto our chosen t-SNE plot
    sns.scatterplot(x='x_projected', y='y_projected', hue='predicted_cluster', palette=sns.color_palette("husl", k), data=df_combo)
    plt.title('t-SNE Plot with Average Linkage Clustering with k=%s Clusters' %(k))
    plt.legend(bbox_to_anchor=(1,1))
    plt.show()
Testing the three linkage methods, we can see that complete linkage with k = 2 agreed most closely with the clustering structure of the t-SNE plot. We therefore select HAC with complete linkage and k = 2 clusters.
k = 2
hac = AgglomerativeClustering(n_clusters=k, affinity='precomputed', linkage='complete')
df_combo['predicted_cluster'] = hac.fit_predict(dist_mat)
#Map the resulting cluster labels onto our chosen t-SNE plot
sns.scatterplot(x='x_projected',y='y_projected', hue='predicted_cluster', palette=sns.color_palette("husl", k), data=df_combo)
plt.title('t-SNE Plot with Complete Linkage Clustering with k=%s Clusters' %(k))
plt.legend(bbox_to_anchor=(1,1))
plt.show()
X['hac_complete_predicted_cluster'] = df_combo['predicted_cluster']
X
Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | ... | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | k_proto_predicted_cluster | hac_complete_predicted_cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1237 | Existing Customer | 38 | F | 3 | Graduate | Married | Less than $40K | Blue | 18 | 6 | ... | 2193.0 | 1260 | 933.0 | 0.886 | 1799 | 47 | 0.567 | 0.575 | Attrited Customer | 1 |
1201 | Existing Customer | 57 | F | 1 | Unknown | Married | Less than $40K | Blue | 47 | 4 | ... | 3416.0 | 2468 | 948.0 | 0.598 | 1536 | 42 | 0.556 | 0.722 | Attrited Customer | 1 |
6921 | Existing Customer | 41 | F | 4 | High School | Married | Unknown | Blue | 30 | 5 | ... | 9139.0 | 1620 | 7519.0 | 0.773 | 4675 | 78 | 1.108 | 0.177 | Attrited Customer | 1 |
6133 | Existing Customer | 42 | F | 4 | Graduate | Married | Less than $40K | Blue | 36 | 6 | ... | 1438.3 | 0 | 1438.3 | 0.795 | 4396 | 70 | 0.628 | 0.000 | Attrited Customer | 1 |
4396 | Existing Customer | 42 | F | 3 | High School | Married | Less than $40K | Blue | 36 | 4 | ... | 9351.0 | 1482 | 7869.0 | 0.464 | 5099 | 60 | 0.714 | 0.158 | Attrited Customer | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2077 | Attrited Customer | 58 | M | 2 | Uneducated | Single | $120K + | Blue | 46 | 2 | ... | 16163.0 | 0 | 16163.0 | 0.542 | 2171 | 48 | 0.778 | 0.000 | Existing Customer | 0 |
8757 | Attrited Customer | 50 | M | 4 | High School | Single | $120K + | Blue | 41 | 4 | ... | 6982.0 | 0 | 6982.0 | 1.018 | 4830 | 50 | 0.724 | 0.000 | Existing Customer | 0 |
3094 | Attrited Customer | 49 | M | 4 | Graduate | Single | $60K - $80K | Blue | 36 | 1 | ... | 5662.0 | 1484 | 4178.0 | 0.434 | 1693 | 25 | 0.250 | 0.262 | Existing Customer | 0 |
5553 | Attrited Customer | 62 | M | 0 | High School | Single | $60K - $80K | Blue | 51 | 3 | ... | 18734.0 | 0 | 18734.0 | 0.674 | 2112 | 52 | 0.857 | 0.000 | Existing Customer | 0 |
8784 | Attrited Customer | 53 | F | 2 | High School | Single | Less than $40K | Blue | 46 | 6 | ... | 3199.0 | 0 | 3199.0 | 1.047 | 4805 | 59 | 0.639 | 0.000 | Attrited Customer | 1 |
1600 rows × 22 columns
for col in cat_list:
    ctab = pd.crosstab(X['hac_complete_predicted_cluster'], X[col], normalize='index')
    print(ctab)
    ctab.plot.bar()
    plt.title(col)
    plt.legend(bbox_to_anchor=(1,1))
    plt.show()
    print('----------------------------------------------------')
Gender                                  F         M
hac_complete_predicted_cluster
0                                0.011494  0.988506
1                                0.986726  0.013274
----------------------------------------------------
Education_Level                  College  Doctorate  Graduate  High School  Post-Graduate  Uneducated   Unknown
hac_complete_predicted_cluster
0                               0.106322   0.043103  0.310345     0.192529       0.060345    0.136494  0.150862
1                               0.099558   0.053097  0.298673     0.203540       0.049779    0.143805  0.151549
----------------------------------------------------
Marital_Status                  Divorced   Married    Single   Unknown
hac_complete_predicted_cluster
0                               0.070402  0.443966  0.390805  0.094828
1                               0.090708  0.453540  0.396018  0.059735
----------------------------------------------------
Income_Category                  $120K +  $40K - $60K  $60K - $80K  $80K - $120K  Less than $40K   Unknown
hac_complete_predicted_cluster
0                               0.162356     0.155172     0.280172       0.33477        0.058908  0.008621
1                               0.000000     0.190265     0.000000       0.00000        0.610619  0.199115
----------------------------------------------------
Card_Category                       Blue      Gold  Platinum    Silver
hac_complete_predicted_cluster
0                               0.909483  0.012931  0.002874  0.074713
1                               0.948009  0.005531  0.004425  0.042035
----------------------------------------------------
for col in num_list:
    sns.boxplot(x='hac_complete_predicted_cluster', y=col, data=X)
    plt.title(col)
    plt.show()
There are some noticeable points to focus on here:
Most characteristics shown in the complete-linkage visualizations are nearly identical to those from K-Prototypes, except that clusters 0 and 1 are swapped.
Similar to the assumption made for K-Prototypes, we would expect cluster 0 to be the existing customers.
# The values of 'hac_complete_predicted_cluster' are either 0 or 1, we need to convert them into 'Existing' and 'Attrited'
X['hac_complete_predicted_cluster'] = X['hac_complete_predicted_cluster'].map({0: 'Existing Customer', 1:'Attrited Customer'})
X
Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | ... | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | k_proto_predicted_cluster | hac_complete_predicted_cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1237 | Existing Customer | 38 | F | 3 | Graduate | Married | Less than $40K | Blue | 18 | 6 | ... | 2193.0 | 1260 | 933.0 | 0.886 | 1799 | 47 | 0.567 | 0.575 | Attrited Customer | Attrited Customer |
1201 | Existing Customer | 57 | F | 1 | Unknown | Married | Less than $40K | Blue | 47 | 4 | ... | 3416.0 | 2468 | 948.0 | 0.598 | 1536 | 42 | 0.556 | 0.722 | Attrited Customer | Attrited Customer |
6921 | Existing Customer | 41 | F | 4 | High School | Married | Unknown | Blue | 30 | 5 | ... | 9139.0 | 1620 | 7519.0 | 0.773 | 4675 | 78 | 1.108 | 0.177 | Attrited Customer | Attrited Customer |
6133 | Existing Customer | 42 | F | 4 | Graduate | Married | Less than $40K | Blue | 36 | 6 | ... | 1438.3 | 0 | 1438.3 | 0.795 | 4396 | 70 | 0.628 | 0.000 | Attrited Customer | Attrited Customer |
4396 | Existing Customer | 42 | F | 3 | High School | Married | Less than $40K | Blue | 36 | 4 | ... | 9351.0 | 1482 | 7869.0 | 0.464 | 5099 | 60 | 0.714 | 0.158 | Attrited Customer | Attrited Customer |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2077 | Attrited Customer | 58 | M | 2 | Uneducated | Single | $120K + | Blue | 46 | 2 | ... | 16163.0 | 0 | 16163.0 | 0.542 | 2171 | 48 | 0.778 | 0.000 | Existing Customer | Existing Customer |
8757 | Attrited Customer | 50 | M | 4 | High School | Single | $120K + | Blue | 41 | 4 | ... | 6982.0 | 0 | 6982.0 | 1.018 | 4830 | 50 | 0.724 | 0.000 | Existing Customer | Existing Customer |
3094 | Attrited Customer | 49 | M | 4 | Graduate | Single | $60K - $80K | Blue | 36 | 1 | ... | 5662.0 | 1484 | 4178.0 | 0.434 | 1693 | 25 | 0.250 | 0.262 | Existing Customer | Existing Customer |
5553 | Attrited Customer | 62 | M | 0 | High School | Single | $60K - $80K | Blue | 51 | 3 | ... | 18734.0 | 0 | 18734.0 | 0.674 | 2112 | 52 | 0.857 | 0.000 | Existing Customer | Existing Customer |
8784 | Attrited Customer | 53 | F | 2 | High School | Single | Less than $40K | Blue | 46 | 6 | ... | 3199.0 | 0 | 3199.0 | 1.047 | 4805 | 59 | 0.639 | 0.000 | Attrited Customer | Attrited Customer |
1600 rows × 22 columns
print(homogeneity_score(X['Attrition_Flag'], X['hac_complete_predicted_cluster']))
print(completeness_score(X['Attrition_Flag'], X['hac_complete_predicted_cluster']))
print(v_measure_score(X['Attrition_Flag'], X['hac_complete_predicted_cluster']))
0.001486385090930868
0.0015047815642277613
0.0014955267559688395
adjusted_rand_score(X['Attrition_Flag'], X['hac_complete_predicted_cluster'])
0.0014110457780515337
Comparing the supervised clustering evaluation metrics to those of K-Prototypes, the complete linkage's scores are slightly larger. We may say that complete linkage agreed better than K-Prototypes with the clustering structure of the t-SNE plot, but its labeling still does not match the pre-assigned labels.
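Since both sets of predicted labels now use the same strings, we can also check how often the two algorithms agree with each other; a small sketch:
# Fraction of customers to whom K-Prototypes and HAC-complete assign the
# same label; a high value means the two methods found similar splits.
print((X['k_proto_predicted_cluster'] == X['hac_complete_predicted_cluster']).mean())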
k = 2
hac = AgglomerativeClustering(n_clusters=k, affinity='precomputed', linkage='complete')
df_combo['predicted_cluster'] = hac.fit_predict(dist_mat)
sns.scatterplot(x='x_projected',y='y_projected', hue='predicted_cluster', palette=sns.color_palette("husl", k), style=X['Attrition_Flag'], data=df_combo)
plt.title('t-SNE Plot with Complete Linkage Clustering with k=%s Clusters' %(k))
plt.legend(bbox_to_anchor=(1,1))
plt.show()
We have applied two different algorithms: K-Prototypes and Hierarchical Agglomerative Clustering (single, complete, and average linkage). Comparing the t-SNE plots and several supervised evaluation metrics, complete linkage performed better than K-Prototypes.
However, neither algorithm was able to accurately cluster and label whether a customer has churned based on their overall information.
Research Question 1
We found that we were able to cluster credit card customers based only on their banking information and to distinguish those clusters by demographic information. The primary banking feature distinguishing the clusters was card category, and the primary demographic feature underlying the clustering was income level.
This result shows that customers can be segmented using only banking information and that the resulting clusters align with demographics. It also shows the importance of the income levels underlying banking information.
Research Question 2
Through diverse analyses, we failed to detect any underlying association between customers' overall information and whether a customer has churned. Instead, we found a signal similar to that of research question 1: banking information, especially 'Income_Category', remained a key factor in distinguishing customers, as it heavily shaped the clustering structure of the t-SNE plot.
Shuyuan Shen: Performed exploratory analyses and analyzed research question 2.
Junseok Yang: Performed exploratory analyses and analyzed research question 2.
Matthew Hoyle: Performed the entirety of the research question 1 analysis.
Dawood, E. A. E., Elfakhrany, E., & Maghraby, F. A. (2019). Improve Profiling Bank Customer's Behavior Using Machine Learning. IEEE Access, 7, 109320-109327.
Elrefai, A. T., Elgazzar, M. H., & Khodeir, A. N. (2021, January). Using Artificial Intelligence in Enhancing Banking Services. In 2021 IEEE 11th Annual Computing and Communication Workshop and Conference (CCWC) (pp. 0980-0986). IEEE.
Kaminskyi, A., Nehrey, M., & Zomchak, L. (2021). Machine Learning Methods Application for Consumer Banking. In SHS Web of Conferences (Vol. 107, p. 12001). EDP Sciences.
Magatef, S. G., & Tomalieh, E. F. (2015). The Impact of Customer Loyalty Programs on Customer Retention. International Journal of Business and Social Science, 6(8), 78-93.
Mishra, A., Reddy, P. G., Kumar, P., & Babu, P. T. B. D. (2020). Profiling Based Credit Limit Approval System Using Machine Learning (Doctoral dissertation, CMR Institute of Technology, Bangalore).
Tynan, A. C., & Drayton, J. (1987). Market Segmentation. Journal of Marketing Management, 2(3), 301-335.
Yankelovich, D., & Meer, D. (2006). Rediscovering Market Segmentation. Harvard Business Review, 84(2), 122.