Churn Prediction Project¶
The project aims to identify customers who are likely to churn, i.e. stop using a service. Each customer is assigned a score representing the probability of churning. Using these scores, the company can send emails with discounts or other promotions to the customers most at risk of churning.
The ML strategy applied to this problem is binary classification, which for one instance (the $i^{th}$ customer) can be expressed as:
$$\large g\left(x_{i}\right) = y_{i}$$
Here $x_i$ is the feature vector of the $i^{th}$ customer and $y_i \in \{0, 1\}$ indicates whether they churned. In brief, the main idea behind this project is to build a model from historical customer data and assign each customer a score for their likelihood of churning.
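As a toy illustration of how such scores would be used (made-up probabilities, not real model output):
import numpy as np

# made-up churn probabilities g(x_i) for five customers
y_score = np.array([0.05, 0.30, 0.62, 0.48, 0.91])
send_promo = y_score >= 0.5  # target customers above the threshold
print(send_promo)            # [False False  True False  True]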
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt
%matplotlib inline
# plotly renderer setting for HTML export
import plotly.io as pio
pio.renderers.default = 'notebook'
Data preparation¶
- Download the data, read it with pandas
- Look at the data
- Make column names and values look uniform
- Check if all the columns read correctly
- Check if the churn variable needs any preparation
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
df
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | ... | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | ... | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
7038 | 6840-RESVB | Male | 0 | Yes | Yes | 24 | Yes | Yes | DSL | Yes | ... | Yes | Yes | Yes | Yes | One year | Yes | Mailed check | 84.80 | 1990.5 | No |
7039 | 2234-XADUH | Female | 0 | Yes | Yes | 72 | Yes | Yes | Fiber optic | No | ... | Yes | No | Yes | Yes | One year | Yes | Credit card (automatic) | 103.20 | 7362.9 | No |
7040 | 4801-JZAZL | Female | 0 | Yes | Yes | 11 | No | No phone service | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.60 | 346.45 | No |
7041 | 8361-LTMKD | Male | 1 | Yes | No | 4 | Yes | Yes | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 74.40 | 306.6 | Yes |
7042 | 3186-AJIEK | Male | 0 | No | No | 66 | Yes | No | Fiber optic | Yes | ... | Yes | Yes | Yes | Yes | Two year | Yes | Bank transfer (automatic) | 105.65 | 6844.5 | No |
7043 rows × 21 columns
# make column names and values uniform
df.columns = df.columns.str.lower().str.replace(' ', '_')
categorical_columns = df.dtypes[df.dtypes == 'object'].index
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')
df.head().T
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
customerid | 7590-vhveg | 5575-gnvde | 3668-qpybk | 7795-cfocw | 9237-hqitu |
gender | female | male | male | male | female |
seniorcitizen | 0 | 0 | 0 | 0 | 0 |
partner | yes | no | no | no | no |
dependents | no | no | no | no | no |
tenure | 1 | 34 | 2 | 45 | 2 |
phoneservice | no | yes | yes | no | yes |
multiplelines | no_phone_service | no | no | no_phone_service | no |
internetservice | dsl | dsl | dsl | dsl | fiber_optic |
onlinesecurity | no | yes | yes | yes | no |
onlinebackup | yes | no | yes | no | no |
deviceprotection | no | yes | no | yes | no |
techsupport | no | no | no | yes | no |
streamingtv | no | no | no | no | no |
streamingmovies | no | no | no | no | no |
contract | month-to-month | one_year | month-to-month | one_year | month-to-month |
paperlessbilling | yes | no | yes | no | yes |
paymentmethod | electronic_check | mailed_check | mailed_check | bank_transfer_(automatic) | electronic_check |
monthlycharges | 29.85 | 56.95 | 53.85 | 42.3 | 70.7 |
totalcharges | 29.85 | 1889.5 | 108.15 | 1840.75 | 151.65 |
churn | no | no | yes | no | yes |
df.dtypes
customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object
# converting total charges to numeric
tc = pd.to_numeric(df.totalcharges, errors='coerce') # blanks = NaN
df[tc.isnull()][['customerid', 'totalcharges']]
customerid | totalcharges | |
---|---|---|
488 | 4472-lvygi | _ |
753 | 3115-czmzd | _ |
936 | 5709-lvoeq | _ |
1082 | 4367-nuyao | _ |
1340 | 1371-dwpaz | _ |
3331 | 7644-omvmy | _ |
3826 | 3213-vvolg | _ |
4380 | 2520-sgtta | _ |
5218 | 2923-arzlg | _ |
6670 | 4075-wkniu | _ |
6754 | 2775-sefee | _ |
df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce') # blanks = NaN
df.totalcharges = df.totalcharges.fillna(0)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   customerid        7043 non-null   object
 1   gender            7043 non-null   object
 2   seniorcitizen     7043 non-null   int64
 3   partner           7043 non-null   object
 4   dependents        7043 non-null   object
 5   tenure            7043 non-null   int64
 6   phoneservice      7043 non-null   object
 7   multiplelines     7043 non-null   object
 8   internetservice   7043 non-null   object
 9   onlinesecurity    7043 non-null   object
 10  onlinebackup      7043 non-null   object
 11  deviceprotection  7043 non-null   object
 12  techsupport       7043 non-null   object
 13  streamingtv       7043 non-null   object
 14  streamingmovies   7043 non-null   object
 15  contract          7043 non-null   object
 16  paperlessbilling  7043 non-null   object
 17  paymentmethod     7043 non-null   object
 18  monthlycharges    7043 non-null   float64
 19  totalcharges      7043 non-null   float64
 20  churn             7043 non-null   object
dtypes: float64(2), int64(2), object(17)
memory usage: 1.1+ MB
df['churn'].unique()
array(['no', 'yes'], dtype=object)
df['churn'] = (df.churn == 'yes').astype(int)
df['churn'].unique()
array([0, 1])
Setting up validation framework¶
from sklearn.model_selection import train_test_split
train_test_split?
Signature:
train_test_split(
    *arrays,
    test_size=None,
    train_size=None,
    random_state=None,
    shuffle=True,
    stratify=None,
)
Docstring:
Split arrays or matrices into random train and test subsets.

Quick utility that wraps input validation, ``next(ShuffleSplit().split(X, y))``,
and application to input data into a single call for splitting (and optionally
subsampling) data into a one-liner.

Parameters
----------
*arrays : sequence of indexables with same length / shape[0]
    Allowed inputs are lists, numpy arrays, scipy-sparse matrices or pandas dataframes.
test_size : float or int, default=None
    If float, should be between 0.0 and 1.0 and represent the proportion of the
    dataset to include in the test split. If int, represents the absolute number
    of test samples. If None, the value is set to the complement of the train size.
    If ``train_size`` is also None, it will be set to 0.25.
train_size : float or int, default=None
    If float, should be between 0.0 and 1.0 and represent the proportion of the
    dataset to include in the train split. If int, represents the absolute number
    of train samples. If None, the value is automatically set to the complement
    of the test size.
random_state : int, RandomState instance or None, default=None
    Controls the shuffling applied to the data before applying the split.
    Pass an int for reproducible output across multiple function calls.
shuffle : bool, default=True
    Whether or not to shuffle the data before splitting. If shuffle=False
    then stratify must be None.
stratify : array-like, default=None
    If not None, data is split in a stratified fashion, using this as the class labels.

Returns
-------
splitting : list, length=2 * len(arrays)
    List containing train-test split of inputs. If the input is sparse, the output
    will be a ``scipy.sparse.csr_matrix``. Else, output type is the same as the
    input type.

Examples
--------
>>> import numpy as np
>>> from sklearn.model_selection import train_test_split
>>> X, y = np.arange(10).reshape((5, 2)), range(5)
>>> X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.33, random_state=42)
>>> train_test_split(y, shuffle=False)
[[0, 1, 2], [3, 4]]
Type: function
# splitting the data into 80% training, 20% testing
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
len(df_full_train), len(df_test)
(5634, 1409)
# splitting full train into train and validation
# validation == 20% of all data (25% of df_full_train, since 0.20/0.80 = 0.25)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)
len(df_train), len(df_val), len(df_test)
(4225, 1409, 1409)
# resetting the index with new shuffled dataframe
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_train
customerid | gender | seniorcitizen | partner | dependents | tenure | phoneservice | multiplelines | internetservice | onlinesecurity | ... | deviceprotection | techsupport | streamingtv | streamingmovies | contract | paperlessbilling | paymentmethod | monthlycharges | totalcharges | churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 8015-ihcgw | female | 0 | yes | yes | 72 | yes | yes | fiber_optic | yes | ... | yes | yes | yes | yes | two_year | yes | electronic_check | 115.50 | 8425.15 | 0 |
1 | 1960-uycnn | male | 0 | no | no | 10 | yes | yes | fiber_optic | no | ... | yes | no | no | yes | month-to-month | yes | electronic_check | 95.25 | 1021.55 | 0 |
2 | 9250-wypll | female | 0 | no | no | 5 | yes | yes | fiber_optic | no | ... | no | no | no | no | month-to-month | no | electronic_check | 75.55 | 413.65 | 1 |
3 | 6786-obwqr | female | 0 | yes | yes | 5 | yes | no | fiber_optic | no | ... | no | no | yes | no | month-to-month | yes | electronic_check | 80.85 | 356.10 | 0 |
4 | 1328-euzhc | female | 0 | yes | no | 18 | yes | no | no | no_internet_service | ... | no_internet_service | no_internet_service | no_internet_service | no_internet_service | two_year | no | mailed_check | 20.10 | 370.50 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4220 | 1309-xgfsn | male | 1 | yes | yes | 52 | yes | yes | dsl | no | ... | yes | no | yes | yes | one_year | yes | electronic_check | 80.85 | 4079.55 | 0 |
4221 | 4819-hjpiw | male | 0 | no | no | 18 | no | no_phone_service | dsl | no | ... | no | no | no | no | month-to-month | no | mailed_check | 25.15 | 476.80 | 0 |
4222 | 3703-vavcl | male | 0 | yes | yes | 2 | yes | no | fiber_optic | no | ... | yes | yes | no | yes | month-to-month | no | credit_card_(automatic) | 90.00 | 190.05 | 1 |
4223 | 3812-lrzir | female | 0 | yes | yes | 27 | yes | yes | no | no_internet_service | ... | no_internet_service | no_internet_service | no_internet_service | no_internet_service | two_year | no | electronic_check | 24.50 | 761.95 | 0 |
4224 | 1704-nrwye | female | 1 | no | no | 9 | yes | no | fiber_optic | no | ... | no | no | yes | no | month-to-month | yes | electronic_check | 80.85 | 751.65 | 1 |
4225 rows × 21 columns
y_train = df_train.churn.values
y_val = df_val.churn.values
y_test = df_test.churn.values
# delete churn variable to avoid accidentally using it
del df_train['churn']
del df_val['churn']
del df_test['churn']
EDA¶
- Check missing values
- Look at the target variable (churn)
- Look at numerical and categorical variables
df_full_train = df_full_train.reset_index(drop=True)
df_full_train
customerid | gender | seniorcitizen | partner | dependents | tenure | phoneservice | multiplelines | internetservice | onlinesecurity | ... | deviceprotection | techsupport | streamingtv | streamingmovies | contract | paperlessbilling | paymentmethod | monthlycharges | totalcharges | churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5442-pptjy | male | 0 | yes | yes | 12 | yes | no | no | no_internet_service | ... | no_internet_service | no_internet_service | no_internet_service | no_internet_service | two_year | no | mailed_check | 19.70 | 258.35 | 0 |
1 | 6261-rcvns | female | 0 | no | no | 42 | yes | no | dsl | yes | ... | yes | yes | no | yes | one_year | no | credit_card_(automatic) | 73.90 | 3160.55 | 1 |
2 | 2176-osjuv | male | 0 | yes | no | 71 | yes | yes | dsl | yes | ... | no | yes | no | no | two_year | no | bank_transfer_(automatic) | 65.15 | 4681.75 | 0 |
3 | 6161-erdgd | male | 0 | yes | yes | 71 | yes | yes | dsl | yes | ... | yes | yes | yes | yes | one_year | no | electronic_check | 85.45 | 6300.85 | 0 |
4 | 2364-ufrom | male | 0 | no | no | 30 | yes | no | dsl | yes | ... | no | yes | yes | no | one_year | no | electronic_check | 70.40 | 2044.75 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5629 | 0781-lkxbr | male | 1 | no | no | 9 | yes | yes | fiber_optic | no | ... | yes | no | yes | yes | month-to-month | yes | electronic_check | 100.50 | 918.60 | 1 |
5630 | 3507-gasnp | male | 0 | no | yes | 60 | yes | no | no | no_internet_service | ... | no_internet_service | no_internet_service | no_internet_service | no_internet_service | two_year | no | mailed_check | 19.95 | 1189.90 | 0 |
5631 | 8868-wozgu | male | 0 | no | no | 28 | yes | yes | fiber_optic | no | ... | yes | no | yes | yes | month-to-month | yes | electronic_check | 105.70 | 2979.50 | 1 |
5632 | 1251-krreg | male | 0 | no | no | 2 | yes | yes | dsl | no | ... | no | no | no | no | month-to-month | yes | mailed_check | 54.40 | 114.10 | 1 |
5633 | 5840-nvdcg | female | 0 | yes | yes | 16 | yes | no | dsl | yes | ... | no | yes | no | yes | two_year | no | bank_transfer_(automatic) | 68.25 | 1114.85 | 0 |
5634 rows × 21 columns
df_full_train.isnull().sum()
customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64
# looking at the distribution of our target variable
# How many users are churning
df_full_train['churn'].value_counts()
churn
0    4113
1    1521
Name: count, dtype: int64
# checking the churn rate (proportion)
df_full_train['churn'].value_counts(normalize=True)
churn
0    0.730032
1    0.269968
Name: proportion, dtype: float64
global_churn_rate = df_full_train.churn.mean()
round(global_churn_rate, 2)
0.27
# specifying variables
numerical = ['tenure', 'monthlycharges', 'totalcharges']
df_full_train.columns
Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents', 'tenure', 'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'], dtype='object')
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
'phoneservice', 'multiplelines', 'internetservice',
'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
'paymentmethod']
df_full_train[categorical].nunique()
gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64
Feature importance: Churn rate and risk ratio¶
Feature importance analysis (part of EDA): identifying which features affect our target variable
- Churn rate
- Risk ratio
- Mutual information
Churn Rate
df_full_train.head()
customerid | gender | seniorcitizen | partner | dependents | tenure | phoneservice | multiplelines | internetservice | onlinesecurity | ... | deviceprotection | techsupport | streamingtv | streamingmovies | contract | paperlessbilling | paymentmethod | monthlycharges | totalcharges | churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5442-pptjy | male | 0 | yes | yes | 12 | yes | no | no | no_internet_service | ... | no_internet_service | no_internet_service | no_internet_service | no_internet_service | two_year | no | mailed_check | 19.70 | 258.35 | 0 |
1 | 6261-rcvns | female | 0 | no | no | 42 | yes | no | dsl | yes | ... | yes | yes | no | yes | one_year | no | credit_card_(automatic) | 73.90 | 3160.55 | 1 |
2 | 2176-osjuv | male | 0 | yes | no | 71 | yes | yes | dsl | yes | ... | no | yes | no | no | two_year | no | bank_transfer_(automatic) | 65.15 | 4681.75 | 0 |
3 | 6161-erdgd | male | 0 | yes | yes | 71 | yes | yes | dsl | yes | ... | yes | yes | yes | yes | one_year | no | electronic_check | 85.45 | 6300.85 | 0 |
4 | 2364-ufrom | male | 0 | no | no | 30 | yes | no | dsl | yes | ... | no | yes | yes | no | one_year | no | electronic_check | 70.40 | 2044.75 | 0 |
5 rows × 21 columns
churn_female = df_full_train[df_full_train.gender == 'female'].churn.mean()
churn_female
0.27682403433476394
churn_male = df_full_train[df_full_train.gender == 'male'].churn.mean()
churn_male
0.2632135306553911
∴ The churn rate is approximately the same for both genders
global_churn_rate = df_full_train.churn.mean()
global_churn_rate
0.26996805111821087
df_full_train.partner.value_counts()
partner
no     2932
yes    2702
Name: count, dtype: int64
churn_partner = df_full_train[df_full_train.partner == 'yes'].churn.mean()
churn_partner
0.20503330866025166
churn_no_partner = df_full_train[df_full_train.partner == 'no'].churn.mean()
churn_no_partner
0.3298090040927694
Due to the noticeable difference, the partner variable may be more important for predicting churn.
Risk ratio
churn_no_partner / global_churn_rate
1.2216593879412643
churn_partner / global_churn_rate
0.7594724924338315
If the risk ratio is greater than 1, the group is more likely to churn than average; if it is less than 1, it is less likely.
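In formula form, for a group $g$ of customers:
$$\large \text{risk}(g) = \cfrac{\text{churn rate within } g}{\text{global churn rate}}$$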
df_group = df_full_train.groupby('gender').churn.agg(['count', 'mean'])
df_group['diff'] = df_group['mean'] - global_churn_rate
df_group['risk'] = df_group['mean'] / global_churn_rate
df_group
count | mean | diff | risk | |
---|---|---|---|---|
gender | ||||
female | 2796 | 0.276824 | 0.006856 | 1.025396 |
male | 2838 | 0.263214 | -0.006755 | 0.974980 |
# display() lets us show multiple dataframes from a single cell
from IPython.display import display
for c in categorical:
    print(c)
    df_group = df_full_train.groupby(c).churn.agg(['count', 'mean'])
    df_group['diff'] = df_group['mean'] - global_churn_rate
    df_group['risk'] = df_group['mean'] / global_churn_rate
    display(df_group)
    print()
    print()
gender
count | mean | diff | risk | |
---|---|---|---|---|
gender | ||||
female | 2796 | 0.276824 | 0.006856 | 1.025396 |
male | 2838 | 0.263214 | -0.006755 | 0.974980 |
seniorcitizen
count | mean | diff | risk | |
---|---|---|---|---|
seniorcitizen | ||||
0 | 4722 | 0.242270 | -0.027698 | 0.897403 |
1 | 912 | 0.413377 | 0.143409 | 1.531208 |
partner
count | mean | diff | risk | |
---|---|---|---|---|
partner | ||||
no | 2932 | 0.329809 | 0.059841 | 1.221659 |
yes | 2702 | 0.205033 | -0.064935 | 0.759472 |
dependents
count | mean | diff | risk | |
---|---|---|---|---|
dependents | ||||
no | 3968 | 0.313760 | 0.043792 | 1.162212 |
yes | 1666 | 0.165666 | -0.104302 | 0.613651 |
phoneservice
count | mean | diff | risk | |
---|---|---|---|---|
phoneservice | ||||
no | 547 | 0.241316 | -0.028652 | 0.893870 |
yes | 5087 | 0.273049 | 0.003081 | 1.011412 |
multiplelines
count | mean | diff | risk | |
---|---|---|---|---|
multiplelines | ||||
no | 2700 | 0.257407 | -0.012561 | 0.953474 |
no_phone_service | 547 | 0.241316 | -0.028652 | 0.893870 |
yes | 2387 | 0.290742 | 0.020773 | 1.076948 |
internetservice
count | mean | diff | risk | |
---|---|---|---|---|
internetservice | ||||
dsl | 1934 | 0.192347 | -0.077621 | 0.712482 |
fiber_optic | 2479 | 0.425171 | 0.155203 | 1.574895 |
no | 1221 | 0.077805 | -0.192163 | 0.288201 |
onlinesecurity
count | mean | diff | risk | |
---|---|---|---|---|
onlinesecurity | ||||
no | 2801 | 0.420921 | 0.150953 | 1.559152 |
no_internet_service | 1221 | 0.077805 | -0.192163 | 0.288201 |
yes | 1612 | 0.153226 | -0.116742 | 0.567570 |
onlinebackup
count | mean | diff | risk | |
---|---|---|---|---|
onlinebackup | ||||
no | 2498 | 0.404323 | 0.134355 | 1.497672 |
no_internet_service | 1221 | 0.077805 | -0.192163 | 0.288201 |
yes | 1915 | 0.217232 | -0.052736 | 0.804660 |
deviceprotection
count | mean | diff | risk | |
---|---|---|---|---|
deviceprotection | ||||
no | 2473 | 0.395875 | 0.125907 | 1.466379 |
no_internet_service | 1221 | 0.077805 | -0.192163 | 0.288201 |
yes | 1940 | 0.230412 | -0.039556 | 0.853480 |
techsupport
count | mean | diff | risk | |
---|---|---|---|---|
techsupport | ||||
no | 2781 | 0.418914 | 0.148946 | 1.551717 |
no_internet_service | 1221 | 0.077805 | -0.192163 | 0.288201 |
yes | 1632 | 0.159926 | -0.110042 | 0.592390 |
streamingtv
count | mean | diff | risk | |
---|---|---|---|---|
streamingtv | ||||
no | 2246 | 0.342832 | 0.072864 | 1.269897 |
no_internet_service | 1221 | 0.077805 | -0.192163 | 0.288201 |
yes | 2167 | 0.302723 | 0.032755 | 1.121328 |
streamingmovies
count | mean | diff | risk | |
---|---|---|---|---|
streamingmovies | ||||
no | 2213 | 0.338906 | 0.068938 | 1.255358 |
no_internet_service | 1221 | 0.077805 | -0.192163 | 0.288201 |
yes | 2200 | 0.307273 | 0.037305 | 1.138182 |
contract
count | mean | diff | risk | |
---|---|---|---|---|
contract | ||||
month-to-month | 3104 | 0.431701 | 0.161733 | 1.599082 |
one_year | 1186 | 0.120573 | -0.149395 | 0.446621 |
two_year | 1344 | 0.028274 | -0.241694 | 0.104730 |
paperlessbilling
count | mean | diff | risk | |
---|---|---|---|---|
paperlessbilling | ||||
no | 2313 | 0.172071 | -0.097897 | 0.637375 |
yes | 3321 | 0.338151 | 0.068183 | 1.252560 |
paymentmethod
count | mean | diff | risk | |
---|---|---|---|---|
paymentmethod | ||||
bank_transfer_(automatic) | 1219 | 0.168171 | -0.101797 | 0.622928 |
credit_card_(automatic) | 1217 | 0.164339 | -0.105630 | 0.608733 |
electronic_check | 1893 | 0.455890 | 0.185922 | 1.688682 |
mailed_check | 1305 | 0.193870 | -0.076098 | 0.718121 |
Feature importance: Mutual information¶
Mutual information is a concept from information theory: it tells us how much we can learn about one variable if we know the value of another.
It is a way to measure the importance of categorical features.
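For reference, the mutual information between two discrete variables $X$ and $Y$ is defined as:
$$MI(X;Y) = \sum_{x}\sum_{y} p(x, y)\,\log\cfrac{p(x, y)}{p(x)\,p(y)}$$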
from sklearn.metrics import mutual_info_score
mutual_info_score(df_full_train.churn, df_full_train.contract)
0.0983203874041556
mutual_info_score(df_full_train.churn, df_full_train.gender)
0.0001174846211139946
# function for getting mutual info score
def mutual_info_churn_score(series):
    return mutual_info_score(df_full_train.churn, series)
# applying the function to get mutual info score for each column
mi = df_full_train[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending=False)
contract            0.098320
onlinesecurity      0.063085
techsupport         0.061032
internetservice     0.055868
onlinebackup        0.046923
deviceprotection    0.043453
paymentmethod       0.043210
streamingtv         0.031853
streamingmovies     0.031581
paperlessbilling    0.017589
dependents          0.012346
partner             0.009968
seniorcitizen       0.009410
multiplelines       0.000857
phoneservice        0.000229
gender              0.000117
dtype: float64
Feature importance: Correlation¶
Correlation is a way of measuring the degree of dependency between two numerical variables.
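Specifically, pandas' `corrwith` computes the Pearson correlation coefficient by default:
$$r = \cfrac{\sum_{i}\left(x_{i} - \bar{x}\right)\left(y_{i} - \bar{y}\right)}{\sqrt{\sum_{i}\left(x_{i} - \bar{x}\right)^{2}}\sqrt{\sum_{i}\left(y_{i} - \bar{y}\right)^{2}}}$$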
df_full_train[numerical].corrwith(df_full_train.churn)
tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64
# if we don't care about the direction
df_full_train[numerical].corrwith(df_full_train.churn).abs()
tenure            0.351885
monthlycharges    0.196805
totalcharges      0.196353
dtype: float64
One-hot encoding¶
- Use Scikit-Learn to encode categorical features
from sklearn.feature_extraction import DictVectorizer
df_train[['gender', 'contract']].iloc[:25]
gender | contract | |
---|---|---|
0 | female | two_year |
1 | male | month-to-month |
2 | female | month-to-month |
3 | female | month-to-month |
4 | female | two_year |
5 | male | month-to-month |
6 | male | month-to-month |
7 | female | month-to-month |
8 | female | two_year |
9 | female | month-to-month |
10 | female | two_year |
11 | male | month-to-month |
12 | female | two_year |
13 | female | month-to-month |
14 | female | month-to-month |
15 | male | month-to-month |
16 | female | two_year |
17 | female | month-to-month |
18 | male | one_year |
19 | male | two_year |
20 | male | month-to-month |
21 | female | one_year |
22 | female | month-to-month |
23 | female | two_year |
24 | male | month-to-month |
# to convert each row to a dictionary
dicts = df_train[['gender', 'contract']].iloc[:25].to_dict(orient='records')
dicts
[{'gender': 'female', 'contract': 'two_year'}, {'gender': 'male', 'contract': 'month-to-month'}, {'gender': 'female', 'contract': 'month-to-month'}, {'gender': 'female', 'contract': 'month-to-month'}, {'gender': 'female', 'contract': 'two_year'}, {'gender': 'male', 'contract': 'month-to-month'}, {'gender': 'male', 'contract': 'month-to-month'}, {'gender': 'female', 'contract': 'month-to-month'}, {'gender': 'female', 'contract': 'two_year'}, {'gender': 'female', 'contract': 'month-to-month'}, {'gender': 'female', 'contract': 'two_year'}, {'gender': 'male', 'contract': 'month-to-month'}, {'gender': 'female', 'contract': 'two_year'}, {'gender': 'female', 'contract': 'month-to-month'}, {'gender': 'female', 'contract': 'month-to-month'}, {'gender': 'male', 'contract': 'month-to-month'}, {'gender': 'female', 'contract': 'two_year'}, {'gender': 'female', 'contract': 'month-to-month'}, {'gender': 'male', 'contract': 'one_year'}, {'gender': 'male', 'contract': 'two_year'}, {'gender': 'male', 'contract': 'month-to-month'}, {'gender': 'female', 'contract': 'one_year'}, {'gender': 'female', 'contract': 'month-to-month'}, {'gender': 'female', 'contract': 'two_year'}, {'gender': 'male', 'contract': 'month-to-month'}]
# create a new instance
dv = DictVectorizer(sparse=False)
# training our dict vectorizer - to teach the vectorizer what kind of values there are
dv.fit(dicts)
DictVectorizer(sparse=False)
dv.get_feature_names_out()
array(['contract=month-to-month', 'contract=one_year', 'contract=two_year', 'gender=female', 'gender=male'], dtype=object)
dv.transform(dicts)
array([[0., 0., 1., 1., 0.], [1., 0., 0., 0., 1.], [1., 0., 0., 1., 0.], [1., 0., 0., 1., 0.], [0., 0., 1., 1., 0.], [1., 0., 0., 0., 1.], [1., 0., 0., 0., 1.], [1., 0., 0., 1., 0.], [0., 0., 1., 1., 0.], [1., 0., 0., 1., 0.], [0., 0., 1., 1., 0.], [1., 0., 0., 0., 1.], [0., 0., 1., 1., 0.], [1., 0., 0., 1., 0.], [1., 0., 0., 1., 0.], [1., 0., 0., 0., 1.], [0., 0., 1., 1., 0.], [1., 0., 0., 1., 0.], [0., 1., 0., 0., 1.], [0., 0., 1., 0., 1.], [1., 0., 0., 0., 1.], [0., 1., 0., 1., 0.], [1., 0., 0., 1., 0.], [0., 0., 1., 1., 0.], [1., 0., 0., 0., 1.]])
# dictvectorizer is smart enough to leave numerical values as is
dicts = df_train[['gender', 'contract', 'tenure']].iloc[:10].to_dict(orient='records')
dv = DictVectorizer(sparse=False) # new instance
dv.fit(dicts) # training it to learn our data
dv.transform(dicts) # transforming to a matrix
array([[ 0., 1., 1., 0., 72.], [ 1., 0., 0., 1., 10.], [ 1., 0., 1., 0., 5.], [ 1., 0., 1., 0., 5.], [ 0., 1., 1., 0., 18.], [ 1., 0., 0., 1., 4.], [ 1., 0., 0., 1., 1.], [ 1., 0., 1., 0., 1.], [ 0., 1., 1., 0., 72.], [ 1., 0., 1., 0., 6.]])
# performing this on our training data
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
train_dicts[0]
{'gender': 'female', 'seniorcitizen': 0, 'partner': 'yes', 'dependents': 'yes', 'phoneservice': 'yes', 'multiplelines': 'yes', 'internetservice': 'fiber_optic', 'onlinesecurity': 'yes', 'onlinebackup': 'yes', 'deviceprotection': 'yes', 'techsupport': 'yes', 'streamingtv': 'yes', 'streamingmovies': 'yes', 'contract': 'two_year', 'paperlessbilling': 'yes', 'paymentmethod': 'electronic_check', 'tenure': 72, 'monthlycharges': 115.5, 'totalcharges': 8425.15}
dv = DictVectorizer(sparse=False) # new instance
dv.fit(train_dicts) # training it to learn our data
dv.get_feature_names_out()
array(['contract=month-to-month', 'contract=one_year', 'contract=two_year', 'dependents=no', 'dependents=yes', 'deviceprotection=no', 'deviceprotection=no_internet_service', 'deviceprotection=yes', 'gender=female', 'gender=male', 'internetservice=dsl', 'internetservice=fiber_optic', 'internetservice=no', 'monthlycharges', 'multiplelines=no', 'multiplelines=no_phone_service', 'multiplelines=yes', 'onlinebackup=no', 'onlinebackup=no_internet_service', 'onlinebackup=yes', 'onlinesecurity=no', 'onlinesecurity=no_internet_service', 'onlinesecurity=yes', 'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no', 'partner=yes', 'paymentmethod=bank_transfer_(automatic)', 'paymentmethod=credit_card_(automatic)', 'paymentmethod=electronic_check', 'paymentmethod=mailed_check', 'phoneservice=no', 'phoneservice=yes', 'seniorcitizen', 'streamingmovies=no', 'streamingmovies=no_internet_service', 'streamingmovies=yes', 'streamingtv=no', 'streamingtv=no_internet_service', 'streamingtv=yes', 'techsupport=no', 'techsupport=no_internet_service', 'techsupport=yes', 'tenure', 'totalcharges'], dtype=object)
X_train = dv.fit_transform(train_dicts)
X_train.shape
(4225, 45)
# Encoding categorical features for validation dataset
val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)
Logistic Regression¶
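Logistic regression passes a linear score through the sigmoid function $\sigma$, so its output can be read as a probability:
$$\large g\left(x_{i}\right) = \sigma\left(w_{0} + x_{i}^{T}w\right) = \cfrac{1}{1 + e^{-\left(w_{0} + x_{i}^{T}w\right)}}$$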
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
z = np.linspace(-7, 7, 51)
z
array([-7.0000000e+00, -6.7200000e+00, -6.4400000e+00, -6.1600000e+00, -5.8800000e+00, -5.6000000e+00, -5.3200000e+00, -5.0400000e+00, -4.7600000e+00, -4.4800000e+00, -4.2000000e+00, -3.9200000e+00, -3.6400000e+00, -3.3600000e+00, -3.0800000e+00, -2.8000000e+00, -2.5200000e+00, -2.2400000e+00, -1.9600000e+00, -1.6800000e+00, -1.4000000e+00, -1.1200000e+00, -8.4000000e-01, -5.6000000e-01, -2.8000000e-01, 8.8817842e-16, 2.8000000e-01, 5.6000000e-01, 8.4000000e-01, 1.1200000e+00, 1.4000000e+00, 1.6800000e+00, 1.9600000e+00, 2.2400000e+00, 2.5200000e+00, 2.8000000e+00, 3.0800000e+00, 3.3600000e+00, 3.6400000e+00, 3.9200000e+00, 4.2000000e+00, 4.4800000e+00, 4.7600000e+00, 5.0400000e+00, 5.3200000e+00, 5.6000000e+00, 5.8800000e+00, 6.1600000e+00, 6.4400000e+00, 6.7200000e+00, 7.0000000e+00])
sigmoid(z)
array([9.11051194e-04, 1.20508423e-03, 1.59386223e-03, 2.10780106e-03, 2.78699622e-03, 3.68423990e-03, 4.86893124e-03, 6.43210847e-03, 8.49286285e-03, 1.12064063e-02, 1.47740317e-02, 1.94550846e-02, 2.55807883e-02, 3.35692233e-02, 4.39398154e-02, 5.73241759e-02, 7.44679452e-02, 9.62155417e-02, 1.23467048e-01, 1.57095469e-01, 1.97816111e-01, 2.46011284e-01, 3.01534784e-01, 3.63547460e-01, 4.30453776e-01, 5.00000000e-01, 5.69546224e-01, 6.36452540e-01, 6.98465216e-01, 7.53988716e-01, 8.02183889e-01, 8.42904531e-01, 8.76532952e-01, 9.03784458e-01, 9.25532055e-01, 9.42675824e-01, 9.56060185e-01, 9.66430777e-01, 9.74419212e-01, 9.80544915e-01, 9.85225968e-01, 9.88793594e-01, 9.91507137e-01, 9.93567892e-01, 9.95131069e-01, 9.96315760e-01, 9.97213004e-01, 9.97892199e-01, 9.98406138e-01, 9.98794916e-01, 9.99088949e-01])
px.line(x=z, y=sigmoid(z))
def linear_regression(xi):
    result = w0
    for j in range(len(w)):
        result = result + xi[j] * w[j]
    return result
def logistic_regression(xi):
    score = w0
    for j in range(len(w)):
        score = score + xi[j] * w[j]
    result = sigmoid(score)
    return result
Training Logistic regression with Scikit-Learn¶
from sklearn.linear_model import LogisticRegression
# training the model
model = LogisticRegression()
model.fit(X_train, y_train)
ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
LogisticRegression()
model.predict(X_train) # hard predictions
array([0, 1, 1, ..., 1, 0, 1])
# getting probability scores
model.predict_proba(X_train) # two columns: probability of class 0 (no churn) and class 1 (churn)
array([[0.90456745, 0.09543255], [0.32070143, 0.67929857], [0.36636832, 0.63363168], ..., [0.46807615, 0.53192385], [0.95743125, 0.04256875], [0.30126044, 0.69873956]])
# getting probability of churning using val data set
y_pred = model.predict_proba(X_val)[:, 1]
y_pred
array([0.00900304, 0.20470486, 0.21236512, ..., 0.13643964, 0.79979009, 0.83743433])
churn_decision = (y_pred >= 0.5)
churn_decision
array([False, False, False, ..., False, True, True])
churn_decision.astype(int)
array([0, 0, 0, ..., 0, 1, 1])
(y_val == churn_decision).mean()
0.8034066713981547
About 80% of our predictions match the actual values.
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = churn_decision.astype(int)
df_pred['actual'] = y_val
df_pred
probability | prediction | actual | |
---|---|---|---|
0 | 0.009003 | 0 | 0 |
1 | 0.204705 | 0 | 0 |
2 | 0.212365 | 0 | 0 |
3 | 0.542958 | 1 | 1 |
4 | 0.213888 | 0 | 0 |
... | ... | ... | ... |
1404 | 0.314074 | 0 | 0 |
1405 | 0.039389 | 0 | 1 |
1406 | 0.136440 | 0 | 0 |
1407 | 0.799790 | 1 | 1 |
1408 | 0.837434 | 1 | 1 |
1409 rows × 3 columns
Model interpretation¶
- Look at the coefficients
- Train a similar model with fewer features
# weights for each feature
model.coef_[0].round(3)
array([ 0.475, -0.175, -0.408, -0.03 , -0.078, 0.063, -0.089, -0.081, -0.034, -0.073, -0.335, 0.317, -0.089, 0.004, -0.258, 0.142, 0.009, 0.063, -0.089, -0.081, 0.266, -0.089, -0.284, -0.231, 0.123, -0.166, 0.059, -0.087, -0.032, 0.07 , -0.059, 0.142, -0.249, 0.216, -0.12 , -0.089, 0.102, -0.071, -0.089, 0.052, 0.213, -0.089, -0.232, -0.07 , 0. ])
dv.get_feature_names_out()
array(['contract=month-to-month', 'contract=one_year', 'contract=two_year', 'dependents=no', 'dependents=yes', 'deviceprotection=no', 'deviceprotection=no_internet_service', 'deviceprotection=yes', 'gender=female', 'gender=male', 'internetservice=dsl', 'internetservice=fiber_optic', 'internetservice=no', 'monthlycharges', 'multiplelines=no', 'multiplelines=no_phone_service', 'multiplelines=yes', 'onlinebackup=no', 'onlinebackup=no_internet_service', 'onlinebackup=yes', 'onlinesecurity=no', 'onlinesecurity=no_internet_service', 'onlinesecurity=yes', 'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no', 'partner=yes', 'paymentmethod=bank_transfer_(automatic)', 'paymentmethod=credit_card_(automatic)', 'paymentmethod=electronic_check', 'paymentmethod=mailed_check', 'phoneservice=no', 'phoneservice=yes', 'seniorcitizen', 'streamingmovies=no', 'streamingmovies=no_internet_service', 'streamingmovies=yes', 'streamingtv=no', 'streamingtv=no_internet_service', 'streamingtv=yes', 'techsupport=no', 'techsupport=no_internet_service', 'techsupport=yes', 'tenure', 'totalcharges'], dtype=object)
# zipping each feature with its own weight to a dict
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))
{'contract=month-to-month': 0.475, 'contract=one_year': -0.175, 'contract=two_year': -0.408, 'dependents=no': -0.03, 'dependents=yes': -0.078, 'deviceprotection=no': 0.063, 'deviceprotection=no_internet_service': -0.089, 'deviceprotection=yes': -0.081, 'gender=female': -0.034, 'gender=male': -0.073, 'internetservice=dsl': -0.335, 'internetservice=fiber_optic': 0.317, 'internetservice=no': -0.089, 'monthlycharges': 0.004, 'multiplelines=no': -0.258, 'multiplelines=no_phone_service': 0.142, 'multiplelines=yes': 0.009, 'onlinebackup=no': 0.063, 'onlinebackup=no_internet_service': -0.089, 'onlinebackup=yes': -0.081, 'onlinesecurity=no': 0.266, 'onlinesecurity=no_internet_service': -0.089, 'onlinesecurity=yes': -0.284, 'paperlessbilling=no': -0.231, 'paperlessbilling=yes': 0.123, 'partner=no': -0.166, 'partner=yes': 0.059, 'paymentmethod=bank_transfer_(automatic)': -0.087, 'paymentmethod=credit_card_(automatic)': -0.032, 'paymentmethod=electronic_check': 0.07, 'paymentmethod=mailed_check': -0.059, 'phoneservice=no': 0.142, 'phoneservice=yes': -0.249, 'seniorcitizen': 0.216, 'streamingmovies=no': -0.12, 'streamingmovies=no_internet_service': -0.089, 'streamingmovies=yes': 0.102, 'streamingtv=no': -0.071, 'streamingtv=no_internet_service': -0.089, 'streamingtv=yes': 0.052, 'techsupport=no': 0.213, 'techsupport=no_internet_service': -0.089, 'techsupport=yes': -0.232, 'tenure': -0.07, 'totalcharges': 0.0}
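One way to read this dictionary is to sort the features by the magnitude of their weights, which surfaces the most influential ones:
# top 5 features by absolute weight
weights = dict(zip(dv.get_feature_names_out(), model.coef_[0]))
sorted(weights.items(), key=lambda kv: abs(kv[1]), reverse=True)[:5]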
df_train
customerid | gender | seniorcitizen | partner | dependents | tenure | phoneservice | multiplelines | internetservice | onlinesecurity | onlinebackup | deviceprotection | techsupport | streamingtv | streamingmovies | contract | paperlessbilling | paymentmethod | monthlycharges | totalcharges | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 8015-ihcgw | female | 0 | yes | yes | 72 | yes | yes | fiber_optic | yes | yes | yes | yes | yes | yes | two_year | yes | electronic_check | 115.50 | 8425.15 |
1 | 1960-uycnn | male | 0 | no | no | 10 | yes | yes | fiber_optic | no | yes | yes | no | no | yes | month-to-month | yes | electronic_check | 95.25 | 1021.55 |
2 | 9250-wypll | female | 0 | no | no | 5 | yes | yes | fiber_optic | no | no | no | no | no | no | month-to-month | no | electronic_check | 75.55 | 413.65 |
3 | 6786-obwqr | female | 0 | yes | yes | 5 | yes | no | fiber_optic | no | no | no | no | yes | no | month-to-month | yes | electronic_check | 80.85 | 356.10 |
4 | 1328-euzhc | female | 0 | yes | no | 18 | yes | no | no | no_internet_service | no_internet_service | no_internet_service | no_internet_service | no_internet_service | no_internet_service | two_year | no | mailed_check | 20.10 | 370.50 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4220 | 1309-xgfsn | male | 1 | yes | yes | 52 | yes | yes | dsl | no | yes | yes | no | yes | yes | one_year | yes | electronic_check | 80.85 | 4079.55 |
4221 | 4819-hjpiw | male | 0 | no | no | 18 | no | no_phone_service | dsl | no | no | no | no | no | no | month-to-month | no | mailed_check | 25.15 | 476.80 |
4222 | 3703-vavcl | male | 0 | yes | yes | 2 | yes | no | fiber_optic | no | no | yes | yes | no | yes | month-to-month | no | credit_card_(automatic) | 90.00 | 190.05 |
4223 | 3812-lrzir | female | 0 | yes | yes | 27 | yes | yes | no | no_internet_service | no_internet_service | no_internet_service | no_internet_service | no_internet_service | no_internet_service | two_year | no | electronic_check | 24.50 | 761.95 |
4224 | 1704-nrwye | female | 1 | no | no | 9 | yes | no | fiber_optic | no | no | no | no | yes | no | month-to-month | yes | electronic_check | 80.85 | 751.65 |
4225 rows × 20 columns
# subset of features for smaller model
small = ['contract', 'tenure', 'monthlycharges']
df_train[small].iloc[:10].to_dict(orient='records')
[{'contract': 'two_year', 'tenure': 72, 'monthlycharges': 115.5}, {'contract': 'month-to-month', 'tenure': 10, 'monthlycharges': 95.25}, {'contract': 'month-to-month', 'tenure': 5, 'monthlycharges': 75.55}, {'contract': 'month-to-month', 'tenure': 5, 'monthlycharges': 80.85}, {'contract': 'two_year', 'tenure': 18, 'monthlycharges': 20.1}, {'contract': 'month-to-month', 'tenure': 4, 'monthlycharges': 30.5}, {'contract': 'month-to-month', 'tenure': 1, 'monthlycharges': 75.1}, {'contract': 'month-to-month', 'tenure': 1, 'monthlycharges': 70.3}, {'contract': 'two_year', 'tenure': 72, 'monthlycharges': 19.75}, {'contract': 'month-to-month', 'tenure': 6, 'monthlycharges': 109.9}]
dicts_train_small = df_train[small].to_dict(orient='records')
dicts_val_small = df_val[small].to_dict(orient='records')
# vectorizing our small dataset
dv_small = DictVectorizer(sparse=False)
dv_small.fit(dicts_train_small)
DictVectorizer(sparse=False)
dv_small.get_feature_names_out()
array(['contract=month-to-month', 'contract=one_year', 'contract=two_year', 'monthlycharges', 'tenure'], dtype=object)
X_train_small = dv_small.transform(dicts_train_small)
# training our small model
model_small = LogisticRegression()
model_small.fit(X_train_small, y_train)
LogisticRegression()
# our bias term
w0 = model_small.intercept_[0]
w0
-2.477957596325319
# our weights
w = model_small.coef_[0]
w.round(3)
array([ 0.971, -0.024, -0.948, 0.027, -0.036])
# zipping each feature with its own weight to a dict
dict(zip(dv_small.get_feature_names_out(), w.round(3)))
{'contract=month-to-month': 0.971, 'contract=one_year': -0.024, 'contract=two_year': -0.948, 'monthlycharges': 0.027, 'tenure': -0.036}
# example: month-to-month contract, monthlycharges=60, tenure=1
-2.47 + 0.97 + (60 * 0.027) + (1 * -0.036)
0.08399999999999966
If the score is positive, the sigmoid output is greater than 0.5, and vice versa.
# getting the prediction probability
sigmoid(_) # `_` holds the output of the previous cell
0.5209876607065322
We predict that this customer will churn, since the probability is > 0.5.
Using the model¶
df_full_train
customerid | gender | seniorcitizen | partner | dependents | tenure | phoneservice | multiplelines | internetservice | onlinesecurity | ... | deviceprotection | techsupport | streamingtv | streamingmovies | contract | paperlessbilling | paymentmethod | monthlycharges | totalcharges | churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5442-pptjy | male | 0 | yes | yes | 12 | yes | no | no | no_internet_service | ... | no_internet_service | no_internet_service | no_internet_service | no_internet_service | two_year | no | mailed_check | 19.70 | 258.35 | 0 |
1 | 6261-rcvns | female | 0 | no | no | 42 | yes | no | dsl | yes | ... | yes | yes | no | yes | one_year | no | credit_card_(automatic) | 73.90 | 3160.55 | 1 |
2 | 2176-osjuv | male | 0 | yes | no | 71 | yes | yes | dsl | yes | ... | no | yes | no | no | two_year | no | bank_transfer_(automatic) | 65.15 | 4681.75 | 0 |
3 | 6161-erdgd | male | 0 | yes | yes | 71 | yes | yes | dsl | yes | ... | yes | yes | yes | yes | one_year | no | electronic_check | 85.45 | 6300.85 | 0 |
4 | 2364-ufrom | male | 0 | no | no | 30 | yes | no | dsl | yes | ... | no | yes | yes | no | one_year | no | electronic_check | 70.40 | 2044.75 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5629 | 0781-lkxbr | male | 1 | no | no | 9 | yes | yes | fiber_optic | no | ... | yes | no | yes | yes | month-to-month | yes | electronic_check | 100.50 | 918.60 | 1 |
5630 | 3507-gasnp | male | 0 | no | yes | 60 | yes | no | no | no_internet_service | ... | no_internet_service | no_internet_service | no_internet_service | no_internet_service | two_year | no | mailed_check | 19.95 | 1189.90 | 0 |
5631 | 8868-wozgu | male | 0 | no | no | 28 | yes | yes | fiber_optic | no | ... | yes | no | yes | yes | month-to-month | yes | electronic_check | 105.70 | 2979.50 | 1 |
5632 | 1251-krreg | male | 0 | no | no | 2 | yes | yes | dsl | no | ... | no | no | no | no | month-to-month | yes | mailed_check | 54.40 | 114.10 | 1 |
5633 | 5840-nvdcg | female | 0 | yes | yes | 16 | yes | no | dsl | yes | ... | no | yes | no | yes | two_year | no | bank_transfer_(automatic) | 68.25 | 1114.85 | 0 |
5634 rows × 21 columns
# getting our dictionaries
dicts_full_train = df_full_train[numerical + categorical].to_dict(orient='records')
dicts_full_train[0]
{'tenure': 12, 'monthlycharges': 19.7, 'totalcharges': 258.35, 'gender': 'male', 'seniorcitizen': 0, 'partner': 'yes', 'dependents': 'yes', 'phoneservice': 'yes', 'multiplelines': 'no', 'internetservice': 'no', 'onlinesecurity': 'no_internet_service', 'onlinebackup': 'no_internet_service', 'deviceprotection': 'no_internet_service', 'techsupport': 'no_internet_service', 'streamingtv': 'no_internet_service', 'streamingmovies': 'no_internet_service', 'contract': 'two_year', 'paperlessbilling': 'no', 'paymentmethod': 'mailed_check'}
# getting our feature matrix with DictVectorizer
dv = DictVectorizer(sparse=False)
X_full_train = dv.fit_transform(dicts_full_train)
y_full_train = df_full_train.churn.values
# training our model
model = LogisticRegression()
model.fit(X_full_train, y_full_train)
ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
LogisticRegression()
# testing
# getting our dictionaries
dicts_test = df_test[numerical + categorical].to_dict(orient='records')
# getting our feature matrix with DictVectorizer
X_test = dv.transform(dicts_test)
y_pred = model.predict_proba(X_test)[:, 1]
churn_decision = (y_pred > 0.5)
# accuracy
(churn_decision == y_test).mean()
0.8126330731014905
customer = dicts_test[10]
customer
{'tenure': 32, 'monthlycharges': 93.95, 'totalcharges': 2861.45, 'gender': 'male', 'seniorcitizen': 1, 'partner': 'yes', 'dependents': 'yes', 'phoneservice': 'yes', 'multiplelines': 'no', 'internetservice': 'fiber_optic', 'onlinesecurity': 'no', 'onlinebackup': 'yes', 'deviceprotection': 'no', 'techsupport': 'no', 'streamingtv': 'yes', 'streamingmovies': 'yes', 'contract': 'month-to-month', 'paperlessbilling': 'yes', 'paymentmethod': 'mailed_check'}
x_small = dv.transform([customer])
x_small.shape
(1, 45)
# predicted probability of churning
model.predict_proba(x_small)[0, 1]
0.4946473097005084
49% probability of churning
# actual value of churning
y_test[10]
0
This customer did not actually churn.
# total
len(y_val)
1409
# correct
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)
y_pred = model.predict_proba(X_val)[:, 1]
churn_decision = (y_pred >= 0.5)
(y_val == churn_decision).sum()
1135
# accuracy
(churn_decision == y_val).sum() / len(y_val)
0.8055358410220014
Our current decision threshold is 0.5. Will adjusting this threshold improve our accuracy?
thresholds = np.linspace(0, 1, 21)
thresholds
array([0. , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55, 0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1. ])
scores = []
for t in thresholds:
    churn_decision = (y_pred >= t)
    score = (y_val == churn_decision).mean()
    scores.append(score)
    print(f"Thresh: '{t:.2f}' Score: '{score:.3f}'")
Thresh: '0.00' Score: '0.274'
Thresh: '0.05' Score: '0.507'
Thresh: '0.10' Score: '0.600'
Thresh: '0.15' Score: '0.667'
Thresh: '0.20' Score: '0.717'
Thresh: '0.25' Score: '0.747'
Thresh: '0.30' Score: '0.767'
Thresh: '0.35' Score: '0.776'
Thresh: '0.40' Score: '0.793'
Thresh: '0.45' Score: '0.796'
Thresh: '0.50' Score: '0.806'
Thresh: '0.55' Score: '0.814'
Thresh: '0.60' Score: '0.806'
Thresh: '0.65' Score: '0.789'
Thresh: '0.70' Score: '0.781'
Thresh: '0.75' Score: '0.749'
Thresh: '0.80' Score: '0.738'
Thresh: '0.85' Score: '0.727'
Thresh: '0.90' Score: '0.726'
Thresh: '0.95' Score: '0.726'
Thresh: '1.00' Score: '0.726'
px.line(x=thresholds, y=scores)
# accuracy with scikit learn
from sklearn.metrics import accuracy_score
accuracy_score(y_val, y_pred >= 0.55) # val vs pred
0.8140525195173882
from collections import Counter
Counter(y_val)
Counter({0: 1023, 1: 386})
If we build a dummy model with a decision cutoff of 1, so that it predicts that no clients will churn, its accuracy would be about 73%. The improvement of the original model over this dummy model is therefore not as large as we might expect (80% vs. 73%).
Therefore, in this problem, accuracy alone cannot tell us how good the model is, because the dataset is imbalanced: there are far more instances of one class than the other. This is known as class imbalance.
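The 73% baseline is easy to verify: a dummy model that always predicts the majority class ("no churn") scores exactly the share of non-churning customers in the validation set:
# accuracy of always predicting "no churn"
(y_val == 0).mean()  # ≈ 0.726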
2. Confusion Table¶
The confusion table is a way of measuring the different types of errors and correct decisions that binary classifiers can make.
When it comes to the predictions of a logistic regression model, each one falls into one of four categories:
- Prediction is that the customer WILL churn. This is known as the Positive class
- And Customer actually churned - Known as a True Positive (TP)
- But Customer actually did not churn - Known as a False Positive (FP)
- Prediction is that the customer WILL NOT churn. This is known as the Negative class
- Customer did not churn - True Negative (TN)
- Customer churned - False Negative (FN)
# using our data
actual_positive = (y_val == 1)
actual_negative = (y_val == 0)
# using our predictions
t = 0.55
predict_positive = (y_pred >= t)
predict_negative = (y_pred < t)
predict_positive
array([False, False, False, ..., False, True, True])
true_positive = (predict_positive & actual_positive).sum()
true_negative = (predict_negative & actual_negative).sum()
false_positive = (predict_positive & actual_negative).sum()
false_negative = (predict_negative & actual_positive).sum()
Confusion Table
is a way to summarize the above results in a tabular format, as shown below:
 | Predicted Negative | Predicted Positive |
---|---|---|
Actual Negative | TN | FP |
Actual Positive | FN | TP |
confusion_matrix = np.array([
    [true_negative, false_positive],
    [false_negative, true_positive]
])
confusion_matrix
array([[949, 74], [188, 198]])
Predicted Negative (0) | Predicted Positive (1) | |
---|---|---|
Actual Negative (0) | 949 (TN) | 74 (FP) |
Actual Positive (1) | 188 (FN) | 198 (TP) |
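As a cross-check, scikit-learn's `confusion_matrix` produces the same layout (rows = actual, columns = predicted, negative class first); it is imported under an alias here so it doesn't shadow the array we built above:
from sklearn.metrics import confusion_matrix as sk_confusion_matrix

sk_confusion_matrix(y_val, y_pred >= 0.55)  # [[TN, FP], [FN, TP]]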
# Create labels for the axes
labels = ['Predicted No Churn (0)', 'Predicted Churn (1)']
ticks = ['Actual No Churn (0)', 'Actual Churn (1)']
# Plot the heatmap
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
ax = sns.heatmap(confusion_matrix, annot=True, fmt='d', cmap='Blues', square=True,
                 xticklabels=labels, yticklabels=ticks)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix Heatmap')
plt.show()
(confusion_matrix / confusion_matrix.sum()).round(2)
array([[0.67, 0.05], [0.13, 0.14]])
Our ≈81% accuracy is composed of 67% true negatives + 14% true positives.
3. Precision and Recall¶
Precision tells us the fraction of positive predictions that are correct. It takes into account only the predicted positives (TP and FP, the second column of the confusion matrix), as stated in the following formula:
$$P = \cfrac{TP}{TP + FP}$$
From the predicted positives, how many we predicted right.
p = true_positive / (true_positive + false_positive)
p
0.7279411764705882
Recall measures the fraction of actual positive instances that are correctly identified. It considers only the actual positives (TP and FN, the second row of the confusion table). The formula of this metric is presented below:
$$R = \cfrac{TP}{TP + FN}$$
From the real positives, how many we predicted right.
r = true_positive / (true_positive + false_negative)
r
0.5129533678756477
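These values should match scikit-learn's built-in metrics, which expect hard predictions at the chosen threshold:
from sklearn.metrics import precision_score, recall_score

precision_score(y_val, y_pred >= 0.55)  # ≈ 0.728
recall_score(y_val, y_pred >= 0.55)     # ≈ 0.513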
4. ROC Curves¶
FPR is the fraction of false positives (FP) divided by the total number of actual negatives (FP and TN), and we want to minimize it. The formula of FPR is the following:
$FPR = \large \frac{FP}{TN + FP}$
On the other hand, TPR (or Recall) is the fraction of true positives (TP) divided by the total number of actual positives (FN and TP), and we want to maximize this metric. The formula of this measure is presented below:
$TPR = \large \frac{TP}{TP + FN}$
tpr = true_positive / (true_positive + false_negative)
tpr
0.5129533678756477
Of all the positive (churned) customers, only about 51% are predicted correctly.
fpr = false_positive / (false_positive + true_negative)
fpr
0.07233626588465299
Of all the negative (non-churning) customers, about 7% are predicted incorrectly.
Evaluation Goals:
- High TPR: Correctly classify as many true positives as possible.
- Low FPR: Minimize false positives.
# evaluating different thresholds
scores = []
thresholds = np.linspace(0, 1, 101)
for t in thresholds:
    actual_positive = (y_val == 1)
    actual_negative = (y_val == 0)
    predict_positive = (y_pred >= t)
    predict_negative = (y_pred < t)
    true_positive = (predict_positive & actual_positive).sum()
    true_negative = (predict_negative & actual_negative).sum()
    false_positive = (predict_positive & actual_negative).sum()
    false_negative = (predict_negative & actual_positive).sum()
    scores.append((t, true_positive, false_positive, true_negative, false_negative))
columns = ['threshold', 'true_positive', 'false_positive', 'true_negative', 'false_negative']
df_scores = pd.DataFrame(scores, columns=columns)
df_scores[::10] # every 10th row
threshold | true_positive | false_positive | true_negative | false_negative | |
---|---|---|---|---|---|
0 | 0.0 | 386 | 1023 | 0 | 0 |
10 | 0.1 | 367 | 545 | 478 | 19 |
20 | 0.2 | 336 | 349 | 674 | 50 |
30 | 0.3 | 296 | 238 | 785 | 90 |
40 | 0.4 | 260 | 165 | 858 | 126 |
50 | 0.5 | 217 | 105 | 918 | 169 |
60 | 0.6 | 164 | 52 | 971 | 222 |
70 | 0.7 | 96 | 19 | 1004 | 290 |
80 | 0.8 | 19 | 2 | 1021 | 367 |
90 | 0.9 | 0 | 0 | 1023 | 386 |
100 | 1.0 | 0 | 0 | 1023 | 386 |
df_scores['tpr'] = df_scores.true_positive / (df_scores.true_positive + df_scores.false_negative)
df_scores['fpr'] = df_scores.false_positive / (df_scores.false_positive + df_scores.true_negative)
df_scores['diff'] = df_scores['tpr'] - df_scores['fpr']
df_scores[::10]
threshold | true_positive | false_positive | true_negative | false_negative | tpr | fpr | diff | |
---|---|---|---|---|---|---|---|---|
0 | 0.0 | 386 | 1023 | 0 | 0 | 1.000000 | 1.000000 | 0.000000 |
10 | 0.1 | 367 | 545 | 478 | 19 | 0.950777 | 0.532747 | 0.418030 |
20 | 0.2 | 336 | 349 | 674 | 50 | 0.870466 | 0.341153 | 0.529313 |
30 | 0.3 | 296 | 238 | 785 | 90 | 0.766839 | 0.232649 | 0.534190 |
40 | 0.4 | 260 | 165 | 858 | 126 | 0.673575 | 0.161290 | 0.512285 |
50 | 0.5 | 217 | 105 | 918 | 169 | 0.562176 | 0.102639 | 0.459537 |
60 | 0.6 | 164 | 52 | 971 | 222 | 0.424870 | 0.050831 | 0.374040 |
70 | 0.7 | 96 | 19 | 1004 | 290 | 0.248705 | 0.018573 | 0.230132 |
80 | 0.8 | 19 | 2 | 1021 | 367 | 0.049223 | 0.001955 | 0.047268 |
90 | 0.9 | 0 | 0 | 1023 | 386 | 0.000000 | 0.000000 | 0.000000 |
100 | 1.0 | 0 | 0 | 1023 | 386 | 0.000000 | 0.000000 | 0.000000 |
# plot
fig = px.line(
    df_scores,
    x='threshold',
    y=['tpr', 'fpr'],  # specify both y-values
    labels={'value': 'Rate', 'threshold': 'Threshold'},  # rename axes
    title="TPR and FPR vs Threshold",
)
fig.show()
Random model
# random predictions
np.random.seed(1)
y_rand = np.random.uniform(0, 1, size=len(y_val))
y_rand.round(3)
array([0.417, 0.72 , 0. , ..., 0.774, 0.334, 0.089])
# accuracy of the random model: a fair coin is right on about half of each class, so accuracy lands near 0.5
((y_rand >= 0.5) == y_val).mean()
0.5017743080198722
# function for generating df with tpr, fpr
def tpr_fpr_dataframe(y_val, y_pred):
scores = []
thresholds = np.linspace(0, 1, 101)
for t in thresholds:
actual_positive = (y_val == 1)
actual_negative = (y_val == 0)
predict_positive = (y_pred >= t)
predict_negative = (y_pred < t)
true_positive = (predict_positive & actual_positive).sum()
true_negative = (predict_negative & actual_negative).sum()
false_positive = (predict_positive & actual_negative).sum()
false_negative = (predict_negative & actual_positive).sum()
scores.append((t, true_positive, false_positive, true_negative, false_negative))
columns = ['threshold', 'true_positive', 'false_positive', 'true_negative', 'false_negative']
    df_scores = pd.DataFrame(scores, columns=columns)
df_scores['tpr'] = df_scores.true_positive / (df_scores.true_positive + df_scores.false_negative)
df_scores['fpr'] = df_scores.false_positive / (df_scores.false_positive + df_scores.true_negative)
return df_scores
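The loop above is easy to read; as an alternative for larger threshold grids, the same tpr/fpr columns can be computed with NumPy broadcasting and no Python-level loop. This is a sketch, assuming y_val and y_pred are 1-D NumPy arrays:
# vectorized sketch: binarize the predictions at all thresholds at once
def tpr_fpr_vectorized(y_val, y_pred):
    thresholds = np.linspace(0, 1, 101)
    pred_pos = y_pred[None, :] >= thresholds[:, None]  # shape (101, n)
    actual_pos = (y_val == 1)[None, :]
    tp = (pred_pos & actual_pos).sum(axis=1)
    fp = (pred_pos & ~actual_pos).sum(axis=1)
    tpr = tp / (y_val == 1).sum()
    fpr = fp / (y_val == 0).sum()
    return pd.DataFrame({'threshold': thresholds, 'tpr': tpr, 'fpr': fpr})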
df_rand = tpr_fpr_dataframe(y_val, y_rand)
df_rand[::10]
threshold | true_positive | false_positive | true_negative | false_negative | tpr | fpr | |
---|---|---|---|---|---|---|---|
0 | 0.0 | 386 | 1023 | 0 | 0 | 1.000000 | 1.000000 |
10 | 0.1 | 347 | 923 | 100 | 39 | 0.898964 | 0.902248 |
20 | 0.2 | 307 | 822 | 201 | 79 | 0.795337 | 0.803519 |
30 | 0.3 | 276 | 724 | 299 | 110 | 0.715026 | 0.707722 |
40 | 0.4 | 237 | 624 | 399 | 149 | 0.613990 | 0.609971 |
50 | 0.5 | 202 | 518 | 505 | 184 | 0.523316 | 0.506354 |
60 | 0.6 | 161 | 409 | 614 | 225 | 0.417098 | 0.399804 |
70 | 0.7 | 121 | 302 | 721 | 265 | 0.313472 | 0.295210 |
80 | 0.8 | 78 | 206 | 817 | 308 | 0.202073 | 0.201369 |
90 | 0.9 | 40 | 101 | 922 | 346 | 0.103627 | 0.098729 |
100 | 1.0 | 0 | 0 | 1023 | 386 | 0.000000 | 0.000000 |
# plot
fig = px.line(
df_rand,
x='threshold',
y=['tpr', 'fpr'], # Specify both y-values
labels={'value': 'Rate', 'threshold': 'Threshold'}, # Rename axes
title="TPR and FPR vs Threshold",
)
fig.show()
Ideal model
num_neg = (y_val == 0).sum()
num_pos = (y_val == 1).sum()
num_neg, num_pos
(1023, 386)
# create an ordered target: all non-churning (0) first, then all churning (1), to serve as the ideal y_val
y_ideal = np.repeat([0, 1], [num_neg, num_pos])
y_ideal
array([0, 0, 0, ..., 1, 1, 1])
# share of negatives: the cutoff at which the ideal model separates the classes perfectly
num_neg / (num_pos + num_neg)
0.7260468417317246
# checking the ideal model's accuracy at that cutoff
y_ideal_pred = np.linspace(0, 1, len(y_val))
((y_ideal_pred >= 0.726) == y_ideal).mean()
1.0
The ideal model reaches 100% accuracy at this threshold, but such a model does not exist in practice; it is only a reference point.
df_ideal = tpr_fpr_dataframe(y_ideal, y_ideal_pred)
# plot
fig = px.line(
df_ideal,
x='threshold',
y=['tpr', 'fpr'], # Specify both y-values
labels={'value': 'Rate', 'threshold': 'Threshold'}, # Rename axes
title="TPR and FPR vs Threshold",
)
fig.show()
Putting everything together
plt.plot(df_scores.threshold, df_scores['tpr'], label='TPR')
plt.plot(df_scores.threshold, df_scores['fpr'], label='FPR')
plt.plot(df_ideal.threshold, df_ideal['tpr'], label='TPR ideal', color='black')
plt.plot(df_ideal.threshold, df_ideal['fpr'], label='FPR ideal', color='black')
# plt.plot(df_rand.threshold, df_rand['tpr'], label='TPR random', color='grey')
# plt.plot(df_rand.threshold, df_rand['fpr'], label='FPR random', color='grey')
plt.legend()
<matplotlib.legend.Legend at 0x1278ab4e3c0>
# ROC Curve
plt.figure(figsize=(6, 6))  # ROC curves are conventionally drawn square
plt.plot(df_scores.fpr, df_scores.tpr, label='model')
plt.plot(df_rand.fpr, df_rand.tpr, label='random')
plt.plot(df_ideal.fpr, df_ideal.tpr, label='ideal')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend()
<matplotlib.legend.Legend at 0x1278abadc10>
Key points on the curve:
- Bottom-left (FPR = 0, TPR = 0, Threshold = 1): Predicts everything as negative.
- Top-left (FPR = 0, TPR = 1): Perfect predictions.
- Top-right (FPR = 1, TPR = 1, Threshold = 0): Predicts everything as positive.
For the random model, TPR and FPR move together along the diagonal.
For our model, we want the curve as close as possible to the ideal spot (top-left) and as far as possible from the random baseline.
# plotting ROC curves with scikit-learn
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_val, y_pred)
plt.figure(figsize=(6, 6))  # ROC curves are conventionally drawn square
plt.plot(fpr, tpr, label='Model')
plt.plot([0, 1], [0, 1], label='Random', linestyle='dashed')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend()
<matplotlib.legend.Legend at 0x1278ad9dc10>
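For a one-liner, recent scikit-learn versions (1.0+) can draw the same curve directly from the predictions:
# same ROC curve drawn by scikit-learn itself (requires scikit-learn >= 1.0)
from sklearn.metrics import RocCurveDisplay
RocCurveDisplay.from_predictions(y_val, y_pred)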
5. ROC AUC¶
The area under the ROC curve (AUC) summarizes how good our model is in a single value. The AUC of a random model is 0.5, while that of an ideal model is 1.
from sklearn.metrics import auc
# auc of our model
auc(fpr, tpr)
0.851873236797188
# auc of the ideal model (slightly below 1 only because of the coarse 101-point threshold grid)
auc(df_ideal.fpr, df_ideal.tpr)
0.9999430203759136
# AUC computed directly from the predictions
from sklearn.metrics import roc_auc_score
roc_auc_score(y_val, y_pred)
0.851873236797188
AUC can be interpreted as the probability that a randomly selected positive example has a greater score than a randomly selected negative example.
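We can verify this interpretation empirically: sample random positive/negative pairs and count how often the positive example gets the higher score. A quick sketch; the estimate should land close to the AUC above:
# empirical check: P(score of random positive > score of random negative) ~ AUC
np.random.seed(1)
pos = y_pred[y_val == 1]
neg = y_pred[y_val == 0]
n = 100000
pos_sample = np.random.choice(pos, size=n)
neg_sample = np.random.choice(neg, size=n)
(pos_sample > neg_sample).mean()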
6. Cross-Validation¶
Cross-validation refers to evaluating the same model on different subsets of a dataset and reporting the average of the metric along with its spread across the subsets.
# function for training our model
def train(df_train, y_train, C=1.0):
dicts = df_train[categorical + numerical].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dicts)
    model = LogisticRegression(C=C, max_iter=1000)  # C is the inverse of the regularization strength: smaller C means stronger regularization
model.fit(X_train, y_train)
return dv, model
dv, model = train(df_train, y_train, C=1.0)
c:\Program Files\Python312\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1): STOP: TOTAL NO. of ITERATIONS REACHED LIMIT. Increase the number of iterations (max_iter) or scale the data as shown in: https://scikit-learn.org/stable/modules/preprocessing.html Please also refer to the documentation for alternative solver options: https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
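The warning is harmless here, but to silence it, the usual options are to raise max_iter further or to scale the features before fitting. A sketch of the scaling route, assuming the dense X_train produced by DictVectorizer(sparse=False) above:
# sketch: standardize features before logistic regression to help lbfgs converge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
scaled_model = make_pipeline(StandardScaler(), LogisticRegression(C=1.0, max_iter=1000))
# scaled_model.fit(X_train, y_train)  # X_train from dv.fit_transform(dicts)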
# function for predicting our target variable
def predict(df, dv, model):
dicts = df[categorical + numerical].to_dict(orient='records')
X = dv.transform(dicts)
y_pred = model.predict_proba(X)[:, 1]
return y_pred
y_pred = predict(df_val, dv, model)
# k-fold cross-validation
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10, shuffle=True, random_state=1)  # repeatedly split the data into 90% training and 10% validation
# indexes for train and validation
train_idx, val_idx = next(kfold.split(df_full_train))
# to see how long each iteration takes
from tqdm.auto import tqdm
# looping over the folds
scores = []
for train_idx, val_idx in tqdm(kfold.split(df_full_train)):
df_train = df_full_train.iloc[train_idx]
df_val = df_full_train.iloc[val_idx]
y_train = df_train.churn.values
y_val = df_val.churn.values
dv, model = train(df_train, y_train)
y_pred = predict(df_val, dv, model)
    auc = roc_auc_score(y_val, y_pred)  # note: this name shadows sklearn.metrics.auc imported earlier
scores.append(auc)
0it [00:00, ?it/s] ... 10it [00:22, 2.30s/it]
(the lbfgs ConvergenceWarning shown above repeats on each of the ten folds)
scores
[0.8498640332045226, 0.8426051980198019, 0.8584123714765046, 0.8333717389515266, 0.8247983870967741, 0.8414085914085915, 0.8430804298274177, 0.825466476913346, 0.8460204535349045, 0.8612987798729453]
print(f"Mean: '{np.mean(scores):.3f}' Std Dev: +- '{np.std(scores):.3f}'")
Mean: '0.843' Std Dev: +- '0.012'
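For reference, roughly the same numbers can be obtained with scikit-learn's built-in helper by wrapping the vectorizer and the model in a Pipeline, so each fold refits both. A sketch under those assumptions (the fold boundaries differ from our shuffled KFold, so the numbers will not match exactly):
# sketch: k-fold AUC via cross_val_score with a DictVectorizer + LogisticRegression pipeline
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
pipeline = make_pipeline(DictVectorizer(sparse=False), LogisticRegression(C=1.0, max_iter=1000))
dicts_full = df_full_train[categorical + numerical].to_dict(orient='records')
cv_scores = cross_val_score(pipeline, dicts_full, df_full_train.churn.values, cv=10, scoring='roc_auc')
cv_scores.mean(), cv_scores.std()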
# final model: train on the full training set and evaluate on the held-out test set
dv, model = train(df_full_train, df_full_train.churn.values, C=1.0)
y_pred = predict(df_test, dv, model)
auc = roc_auc_score(y_test, y_pred)
auc
0.8583300833089581
For bigger datasets, a smaller number of splits is usually enough; for smaller datasets, use more splits to get a more reliable estimate.