Customer churn analysis¶
Customer churn, also known as customer attrition, occurs when a customer stops using a company's products or services. In this dataset, the churn value indicates whether the customer discontinued the service during the last month. Being able to predict which customers are most likely to churn can be very valuable for a business.
This analysis aims to predict customer churn for a telecom company using a variety of machine learning models. The dataset contains 7,043 rows with 21 features related to customer demographics, service usage, and contract details. Logistic Regression emerged as the best model with an accuracy of 81% and an AUC score of 0.85. Key factors influencing churn include tenure, contract type, monthly charges, and internet service type.
The analysis proceeds in the following sections:
- Loading and cleaning the dataset
- Exploratory data analysis
- Feature engineering
- Model building
The data used is from Kaggle: https://www.kaggle.com/datasets/blastchar/telco-customer-churn/
1. Loading and cleaning the dataset¶
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Import data from a csv file into a dataframe
df = pd.read_csv("Telco-Customer-Churn.csv")
df.head()
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | ... | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | ... | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
5 rows × 21 columns
# Produce a summary of the dataset
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7043 entries, 0 to 7042 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerID 7043 non-null object 1 gender 7043 non-null object 2 SeniorCitizen 7043 non-null int64 3 Partner 7043 non-null object 4 Dependents 7043 non-null object 5 tenure 7043 non-null int64 6 PhoneService 7043 non-null object 7 MultipleLines 7043 non-null object 8 InternetService 7043 non-null object 9 OnlineSecurity 7043 non-null object 10 OnlineBackup 7043 non-null object 11 DeviceProtection 7043 non-null object 12 TechSupport 7043 non-null object 13 StreamingTV 7043 non-null object 14 StreamingMovies 7043 non-null object 15 Contract 7043 non-null object 16 PaperlessBilling 7043 non-null object 17 PaymentMethod 7043 non-null object 18 MonthlyCharges 7043 non-null float64 19 TotalCharges 7043 non-null object 20 Churn 7043 non-null object dtypes: float64(1), int64(2), object(18) memory usage: 1.1+ MB
In the summary, we can see that the data contains 7043 rows and 21 columns, none of which have null values. Three columns have a numeric data type; the rest are currently stored as strings (object).
# Convert Total charges to numeric, any values that cannot be converted are replaced with NaN.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
# Check for null values
df.isnull().sum()
customerID 0 gender 0 SeniorCitizen 0 Partner 0 Dependents 0 tenure 0 PhoneService 0 MultipleLines 0 InternetService 0 OnlineSecurity 0 OnlineBackup 0 DeviceProtection 0 TechSupport 0 StreamingTV 0 StreamingMovies 0 Contract 0 PaperlessBilling 0 PaymentMethod 0 MonthlyCharges 0 TotalCharges 11 Churn 0 dtype: int64
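Before imputing, it can be helpful to check whether the rows with missing TotalCharges share a pattern (for example, whether they are all brand-new customers). This is a minimal optional check, not part of the original workflow:
# Optional check (sketch): inspect the rows where TotalCharges is missing, before imputation
df[df["TotalCharges"].isna()][["tenure", "MonthlyCharges", "TotalCharges"]]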
# Replace the 11 null values with the median for total charges, so as not to skew the results of the analysis
df["TotalCharges"].fillna(df["TotalCharges"].median(), inplace=True)
# Basic descriptive statistics for the numerical columns in the data set
df.describe()
SeniorCitizen | tenure | MonthlyCharges | TotalCharges | |
---|---|---|---|---|
count | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 |
mean | 0.162147 | 32.371149 | 64.761692 | 2281.916928 |
std | 0.368612 | 24.559481 | 30.090047 | 2265.270398 |
min | 0.000000 | 0.000000 | 18.250000 | 18.800000 |
25% | 0.000000 | 9.000000 | 35.500000 | 402.225000 |
50% | 0.000000 | 29.000000 | 70.350000 | 1397.475000 |
75% | 0.000000 | 55.000000 | 89.850000 | 3786.600000 |
max | 1.000000 | 72.000000 | 118.750000 | 8684.800000 |
print(df.columns)
Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'], dtype='object')
# Drop unnecessary columns
df.drop(columns=["customerID"], inplace=True)
# Check for and drop duplicate rows in the data
df.duplicated().sum()
df.drop_duplicates(inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'> Index: 7021 entries, 0 to 7042 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 gender 7021 non-null object 1 SeniorCitizen 7021 non-null int64 2 Partner 7021 non-null object 3 Dependents 7021 non-null object 4 tenure 7021 non-null int64 5 PhoneService 7021 non-null object 6 MultipleLines 7021 non-null object 7 InternetService 7021 non-null object 8 OnlineSecurity 7021 non-null object 9 OnlineBackup 7021 non-null object 10 DeviceProtection 7021 non-null object 11 TechSupport 7021 non-null object 12 StreamingTV 7021 non-null object 13 StreamingMovies 7021 non-null object 14 Contract 7021 non-null object 15 PaperlessBilling 7021 non-null object 16 PaymentMethod 7021 non-null object 17 MonthlyCharges 7021 non-null float64 18 TotalCharges 7021 non-null float64 19 Churn 7021 non-null object dtypes: float64(2), int64(2), object(16) memory usage: 1.1+ MB
2. Exploratory data analysis¶
# Visualize the Churn variable. Yes indicates a customer that has left within the last month.
sns.countplot(x="Churn", data=df)
plt.title("Distribution of churn")
plt.show()
# Calculate percentage of churn vs. non-churn
churn_rate = df["Churn"].value_counts(normalize=True) * 100
print(churn_rate)
Churn No 73.550776 Yes 26.449224 Name: proportion, dtype: float64
26.4% of customers have left within the previous month.
# Grouped bar plot
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', hue='Churn', data=df)
plt.title('Distribution of Gender and Churn')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.legend(title='Churn', loc='upper right', labels=['No Churn', 'Churn'])
plt.show()
There is an even distribution of gender in the data and there appears to be no difference in rates of churn between genders.
# Visualize the distribution of churn by contract type
sns.countplot(x="Contract", hue="Churn", data=df)
plt.title("Churn Rate by Contract Type")
plt.show()
# Visualize distribution of churn by the type of internet service
sns.countplot(x="InternetService", hue="Churn", data=df)
plt.title("Churn Rate by Internet Service Type")
plt.show()
# Visualize distribution of churn by payment method
sns.countplot(x="PaymentMethod", hue="Churn", data=df)
plt.xticks(rotation=15) # Rotate x-axis labels
plt.title("Churn Rate by payment method")
plt.show()
# Visualize distribution of churn by online security
sns.countplot(x="OnlineSecurity", hue="Churn", data=df)
plt.title("Churn Rate by online security")
plt.show()
From these plots we can see that there is a difference in churn rates between different contract types, Internet service types, payment methods and online security.
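To back up the visual impression with numbers, the churn percentage within each category can be computed directly. A minimal sketch for contract type; the same pattern works for the other categorical columns:
# Percentage of churn ("Yes"/"No") within each contract type
churn_by_contract = df.groupby("Contract")["Churn"].value_counts(normalize=True) * 100
print(churn_by_contract)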
# Boxplot of MonthlyCharges by churn
sns.boxplot(x="Churn", y="MonthlyCharges", data=df)
plt.title("Distribution of MonthlyCharges by Churn")
plt.show()
The median of monthly charges is higher for customers who leave.
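The same comparison can be made numerically with a quick groupby; a small sketch:
# Median monthly charges for churned vs. retained customers
print(df.groupby("Churn")["MonthlyCharges"].median())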
plt.figure(figsize=(15, 5))
# Plot histogram for "tenure"
plt.subplot(1, 3, 1)
plt.hist(df["tenure"], bins=20, color="skyblue", edgecolor="black")
plt.title("Distribution of Tenure")
plt.xlabel("Tenure (Months)")
plt.ylabel("Frequency")
# Plot histogram for "MonthlyCharges"
plt.subplot(1, 3, 2)
plt.hist(df["MonthlyCharges"], bins=20, color="orange", edgecolor="black")
plt.title("Distribution of Monthly Charges")
plt.xlabel("Monthly Charges")
plt.ylabel("Frequency")
# Plot histogram for "TotalCharges"
plt.subplot(1, 3, 3)
plt.hist(df["TotalCharges"], bins=20, color="green", edgecolor="black")
plt.title("Distribution of Total Charges")
plt.xlabel("Total Charges")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()
Tenure
- There is a fairly even distribution of tenures, with peaks at both ends, indicating that there are significant numbers of both very new and long-standing customers.
Monthly Charges
- A pronounced peak around 20 indicates that many customers pay a similar, relatively low monthly amount.
Total Charges
- Total charges depend on tenure and monthly charges. Since each of those distributions has a peak at the low end, the distribution of total charges is also skewed toward low values.
There appear to be no significant outliers in the data.
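This impression can be checked with a simple rule of thumb such as the 1.5 × IQR criterion. A minimal sketch (the threshold is a common convention, not the only possible choice):
# Count values outside 1.5 * IQR for each continuous column
for col in ["tenure", "MonthlyCharges", "TotalCharges"]:
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    n_outliers = ((df[col] < q1 - 1.5 * iqr) | (df[col] > q3 + 1.5 * iqr)).sum()
    print(f"{col}: {n_outliers} potential outliers")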
# Create tenure groups and compare them
df["TenureGroup"] = pd.cut(df["tenure"], bins=[0, 12, 24, 48, 60, 72],
labels=["0-12 Months", "12-24 Months", "24-48 Months", "48-60 Months", "60-72 Months"])
sns.countplot(x="TenureGroup", hue="Churn", data=df)
plt.title("Churn Distribution Across Tenure Groups")
plt.xlabel("Tenure Group")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.show()
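Before the helper column is dropped, the churn percentage per tenure group can also be quantified; a minimal sketch:
# Percentage of churn within each tenure group
print(df.groupby("TenureGroup")["Churn"].value_counts(normalize=True) * 100)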
# Once we"re done with the tenure group analysis, the column is dropped because purely numerical data is needed for further analysis
df.drop(columns=["TenureGroup"], inplace=True)
There is a clear difference in churn between the tenure groups. Newer customers are much more likely to abandon the service, while longer-tenured customers are less likely to do so.
Next we will convert the categorical variables in the data set to numerical, in order to enable further analysis and modeling.
# Identify categorical columns
categorical_cols = df.select_dtypes(include=["object"]).columns
print("Categorical columns:", categorical_cols)
# Identify binary columns
binary_columns = [col for col in categorical_cols if df[col].nunique() == 2]
print("Binary columns:", binary_columns)
# Check unique values in binary columns
for col in binary_columns:
print(f"{col}: {df[col].unique()}")
# Encode binary categorical columns
binary_mapping = {"Yes": 1, "No": 0, "Female": 0, "Male": 1}
for col in binary_columns:
df[col] = df[col].map(binary_mapping)
Categorical columns: Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn'], dtype='object') Binary columns: ['gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn'] gender: ['Female' 'Male'] Partner: ['Yes' 'No'] Dependents: ['No' 'Yes'] PhoneService: ['No' 'Yes'] PaperlessBilling: ['Yes' 'No'] Churn: ['No' 'Yes']
One-hot encoding is used to convert the remaining categorical variables into binary indicator columns, so that the data can be analyzed by machine learning algorithms.
For consistency, all values are converted to 1 or 0 integers rather than leaving a mix of integers and booleans.
# Use one-hot encoding for non-binary categorical variables
non_binary_columns = [col for col in categorical_cols if col not in binary_columns]
print("Non-Binary Categorical Columns:", non_binary_columns)
df = pd.get_dummies(df, columns=non_binary_columns, drop_first=True)
# Convert the column values to binary (1/0) instead of boolean (True/False)
dummy_columns = df.select_dtypes(include=["bool"]).columns
df[dummy_columns] = df[dummy_columns].astype(int)
# Verify changes
df.head()
Non-Binary Categorical Columns: ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']
gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | PaperlessBilling | MonthlyCharges | TotalCharges | Churn | ... | TechSupport_Yes | StreamingTV_No internet service | StreamingTV_Yes | StreamingMovies_No internet service | StreamingMovies_Yes | Contract_One year | Contract_Two year | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 29.85 | 29.85 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 1 | 0 | 0 | 0 | 34 | 1 | 0 | 56.95 | 1889.50 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
2 | 1 | 0 | 0 | 0 | 2 | 1 | 1 | 53.85 | 108.15 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
3 | 1 | 0 | 0 | 0 | 45 | 0 | 0 | 42.30 | 1840.75 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 70.70 | 151.65 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
5 rows × 31 columns
# Compute and visualize a correlation matrix
plt.figure(figsize=(14, 8))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, cmap="coolwarm")
plt.title("Correlation Matrix")
plt.show()
Some notes on how to interpret the correlation matrix.¶
Positive vs Negative Correlation:
- A positive correlation (greater than 0) means two variables tend to move in the same direction.
- A negative correlation (less than 0) means two variables tend to move in opposite directions.
Strength of Correlation:
- Strong correlation: Coefficients close to 1 or -1 (e.g., > 0.8 or < -0.8).
- Moderate correlation: Coefficients between 0.5 and 0.8 (or -0.5 and -0.8).
- Weak correlation: Coefficients between 0 and 0.5 (or 0 and -0.5).
Limitations:
- Correlation does not imply causation: A strong correlation does not mean one variable causes changes in the other.
- Linear relationship only: The correlation coefficient measures only linear relationships. Non-linear or purely monotonic patterns require different techniques (a rank-based alternative is sketched below).
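As one complementary check for non-linear but monotonic relationships, a rank-based (Spearman) correlation can be computed; a minimal sketch, not part of the original analysis:
# Spearman rank correlation captures monotonic relationships, whether or not they are linear
print(df["tenure"].corr(df["TotalCharges"], method="spearman"))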
# Based on the correlation matrix, we can have a closer look at some correlations
correlation1 = df["tenure"].corr(df["TotalCharges"])
correlation2 = df["MonthlyCharges"].corr(df["Churn"])
correlation3 = df["tenure"].corr(df["Churn"])
print(f"The correlation between tenure and total charges is {correlation1:.2f}")
print(f"The correlation between churn and monthly charges is {correlation2:.2f}")
print(f"The correlation between tenure and churn is {correlation3:.2f}")
The correlation between tenure and total charges is 0.82 The correlation between churn and monthly charges is 0.19 The correlation between tenure and churn is -0.35
From these correlations we can see that there is a strong positive correlation between tenure and total charges. Long-standing customers tend to have higher total charges.
The correlation between churn and monthly charges is weak, which suggests that monthly charges alone may have little impact on churn.
There is a weak negative correlation between tenure and churn, which indicates that longer-tenured customers may be less likely to abandon the service.
3. Feature engineering¶
This process transforms the data to enhance predictive modeling.
First, the numeric columns are scaled so that all features are on a similar scale; this prevents features with large ranges from dominating the model.
Two new features are created:
- ChargesPerMonth reflects the mean amount the customer has paid per month over their entire tenure, and as such aims to capture historical billing patterns.
- MonthlyContractInteraction reflects the combined effect of contract type and monthly charges on customer behavior. For instance, customers paying high monthly charges on a long-term contract may have different churn behavior from those on short-term contracts. Additionally, contract type might have a different importance depending on how much customers are paying monthly.
Finally, the multiple columns that all indicate the same thing (no internet service) are condensed into a single column.
from sklearn.preprocessing import MinMaxScaler
# Select continuous features for scaling
scaler = MinMaxScaler()
scaled_columns = ["tenure", "MonthlyCharges", "TotalCharges"]
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])
# Create TotalCharges per Month feature
df["ChargesPerMonth"] = df["TotalCharges"] / (df["tenure"] + 1e-5) # Avoid division by zero
# Interaction between MonthlyCharges and Contract
df["MonthlyContractInteraction"] = df["MonthlyCharges"] * df["Contract_Two year"]
df.head()
gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | PaperlessBilling | MonthlyCharges | TotalCharges | Churn | ... | StreamingTV_Yes | StreamingMovies_No internet service | StreamingMovies_Yes | Contract_One year | Contract_Two year | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | ChargesPerMonth | MonthlyContractInteraction | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 0 | 0.013889 | 0 | 1 | 0.115423 | 0.001275 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0.091741 | 0.0 |
1 | 1 | 0 | 0 | 0 | 0.472222 | 1 | 0 | 0.385075 | 0.215867 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0.457120 | 0.0 |
2 | 1 | 0 | 0 | 0 | 0.027778 | 1 | 1 | 0.354229 | 0.010310 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.371041 | 0.0 |
3 | 1 | 0 | 0 | 0 | 0.625000 | 0 | 0 | 0.239303 | 0.210241 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0.336380 | 0.0 |
4 | 0 | 0 | 0 | 0 | 0.027778 | 1 | 1 | 0.521891 | 0.015330 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0.551682 | 0.0 |
5 rows × 33 columns
# Check correlation with Churn again
numeric_df = df.select_dtypes(include=["number"])
correlation_with_churn = numeric_df.corr()["Churn"].sort_values(ascending=False)
print(correlation_with_churn)
Churn 1.000000 InternetService_Fiber optic 0.308170 PaymentMethod_Electronic check 0.301544 MonthlyCharges 0.194508 PaperlessBilling 0.190891 SeniorCitizen 0.151619 StreamingTV_Yes 0.065032 StreamingMovies_Yes 0.063192 MultipleLines_Yes 0.041958 PhoneService 0.011323 gender -0.008763 MultipleLines_No phone service -0.011323 ChargesPerMonth -0.023700 DeviceProtection_Yes -0.064944 OnlineBackup_Yes -0.081092 PaymentMethod_Mailed check -0.092562 PaymentMethod_Credit card (automatic) -0.133666 Partner -0.149135 Dependents -0.163459 TechSupport_Yes -0.163937 OnlineSecurity_Yes -0.170520 Contract_One year -0.177336 TotalCharges -0.197911 MonthlyContractInteraction -0.204353 StreamingMovies_No internet service -0.228533 StreamingTV_No internet service -0.228533 TechSupport_No internet service -0.228533 DeviceProtection_No internet service -0.228533 OnlineSecurity_No internet service -0.228533 InternetService_No -0.228533 OnlineBackup_No internet service -0.228533 Contract_Two year -0.302076 tenure -0.351508 Name: Churn, dtype: float64
There are multiple columns that indicate the same thing: that the customer doesn't have internet service. These will be condensed to just one column.
# Create a new column "NoInternetService" that is 1 if any of the relevant columns indicate no internet service, otherwise 0
df["NoInternetService"] = df[["StreamingMovies_No internet service",
"StreamingTV_No internet service",
"TechSupport_No internet service",
"DeviceProtection_No internet service",
"OnlineSecurity_No internet service",
"InternetService_No",
"OnlineBackup_No internet service"]].max(axis=1)
# Drop the original columns after combining
columns_to_drop = [
"StreamingMovies_No internet service",
"StreamingTV_No internet service",
"TechSupport_No internet service",
"DeviceProtection_No internet service",
"OnlineSecurity_No internet service",
"InternetService_No",
"OnlineBackup_No internet service"
]
df.drop(columns=columns_to_drop, inplace=True)
# Save the processed data
df.to_csv("processed_telco_churn.csv", index=False)
4. Model building¶
# Separate features and target, this prepares the data for modeling
X = df.drop("Churn", axis=1)
y = df["Churn"]
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
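Because only about a quarter of customers churn, a stratified split that preserves the churn ratio in both sets is a possible alternative; the sketch below is hypothetical and was not used for the results that follow:
# Hypothetical stratified variant of the split (keeps the same churn proportion in train and test)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)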
The models¶
Three different models were chosen for this project, to represent a variety of different approaches.
Logistic Regression
- Estimates the probability of an outcome (e.g., churn) using a logistic function and a linear combination of input features.
- Assumes a linear relationship between the features and the log-odds of the target variable. It is best for data where this assumption holds true.
Random Forest
- A non-parametric ensemble method that builds multiple decision trees and aggregates their predictions, reducing overfitting and providing feature importance.
- Balances complexity with performance, often excelling in real-world classification tasks by handling non-linearity and noisy data.
Support Vector Machine (SVM)
- A kernel-based classifier that can handle complex, non-linear relationships by finding a hyperplane that maximizes class separation.
- Powerful for high-dimensional data and non-linear problems, making it a good choice when relationships are complex.
# try different models to find the one that performs best with the data
# 1. Logistic regression
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
# 2. Random forest classifier
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
# 3. Support Vector Machine (SVM)
from sklearn.svm import SVC
svm_model = SVC(kernel="linear", probability=True)
svm_model.fit(X_train, y_train)
SVC(kernel='linear', probability=True)
Next we will examine the accuracy of the three models using a classification report, accuracy score and a confusion matrix
from sklearn.metrics import classification_report, accuracy_score
# Logistic regression
y_pred1 = lr_model.predict(X_test)
print("Logistic Regression:")
print(classification_report(y_test, y_pred1))
print("Accuracy:", accuracy_score(y_test, y_pred1))
Logistic Regression:
              precision    recall  f1-score   support
           0       0.85      0.89      0.87      1556
           1       0.64      0.54      0.59       551
    accuracy                           0.80      2107
   macro avg       0.74      0.72      0.73      2107
weighted avg       0.79      0.80      0.80      2107
Accuracy: 0.8011390602752729
# Random forest classifier
y_pred2 = rf_model.predict(X_test)
print("Random forest classifier:")
print(classification_report(y_test, y_pred2))
print("Accuracy:", accuracy_score(y_test, y_pred2))
Random forest classifier:
              precision    recall  f1-score   support
           0       0.83      0.89      0.86      1556
           1       0.62      0.49      0.55       551
    accuracy                           0.79      2107
   macro avg       0.73      0.69      0.71      2107
weighted avg       0.78      0.79      0.78      2107
Accuracy: 0.7883246321784527
# Support Vector Machine (SVM)
y_pred3 = svm_model.predict(X_test)
print("Support Vector Machine:")
print(classification_report(y_test, y_pred3))
print("Accuracy:", accuracy_score(y_test, y_pred3))
Support Vector Machine:
              precision    recall  f1-score   support
           0       0.84      0.90      0.87      1556
           1       0.64      0.53      0.58       551
    accuracy                           0.80      2107
   macro avg       0.74      0.71      0.73      2107
weighted avg       0.79      0.80      0.79      2107
Accuracy: 0.8011390602752729
The classification reports for each model show that there are only small differences in precision, recall and overall accuracy.
A confusion matrix is another way to examine the performance of a model. The matrix visualizes the results in four categories:
- True negatives: cases where the model correctly predicted the negative class (upper-left in the matrix)
- False negatives: cases where the model incorrectly predicted negative, when it should have been positive (lower-left)
- False positives: cases where the model incorrectly predicted the positive class, when it should have been negative (upper-right)
- True positives: cases where the model correctly predicted the positive class (lower-right)
from sklearn.metrics import confusion_matrix
# Confusion matrix for logistic regression
cm1 = confusion_matrix(y_test, y_pred1)
sns.heatmap(cm1, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix for Logistic Regression")
plt.show()
# Confusion matrix for random forest classifier
cm2 = confusion_matrix(y_test, y_pred2)
sns.heatmap(cm2, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix for Random forest classifier")
plt.show()
# Confusion matrix Support Vector Machine
cm3 = confusion_matrix(y_test, y_pred3)
sns.heatmap(cm3, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix for Support Vector Machine")
plt.show()
Both the confusion matrices and the classification reports show that all models had more problems with false negatives than with false positives.
# Cross-validation is used to examine whether the model generalizes well
from sklearn.model_selection import cross_val_score
cv_scores1 = cross_val_score(lr_model, X_train, y_train, cv=5)
print(f"Logistic Regression Cross-Validation Scores: {cv_scores1}")
print(f"Mean CV Accuracy: {cv_scores1.mean()}")
print("")
cv_scores2 = cross_val_score(rf_model, X_train, y_train, cv=5)
print(f"Random forest feature Cross-Validation Scores: {cv_scores2}")
print(f"Mean CV Accuracy: {cv_scores2.mean()}")
print("")
cv_scores3 = cross_val_score(svm_model, X_train, y_train, cv=5)
print(f"Support Vector Machine Cross-Validation Scores: {cv_scores3}")
print(f"Mean CV Accuracy: {cv_scores3.mean()}")
Logistic Regression Cross-Validation Scores: [0.8138352 0.81892167 0.78942014 0.80061038 0.80040733]
Mean CV Accuracy: 0.804638943505997

Random forest Cross-Validation Scores: [0.79755849 0.80366226 0.79755849 0.78433367 0.79124236]
Mean CV Accuracy: 0.7948710564318465

Support Vector Machine Cross-Validation Scores: [0.80366226 0.81485249 0.78840285 0.80366226 0.80040733]
Mean CV Accuracy: 0.8021974379108799
Cross-validation is used to assess how well a machine learning model will perform on unseen data. It helps evaluate the modelās ability to generalize by splitting the dataset into multiple subsets and training/testing on different combinations of these subsets.
From these results we can see that all three models are performing well, with a very slight advantage to logistic regression.
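For a side-by-side view, the mean cross-validation accuracies can be collected into a small summary table; a minimal sketch that reuses the score arrays computed above:
# Summarize mean CV accuracy per model
cv_summary = pd.DataFrame({
    "Model": ["Logistic Regression", "Random Forest", "SVM"],
    "Mean CV Accuracy": [cv_scores1.mean(), cv_scores2.mean(), cv_scores3.mean()],
})
print(cv_summary.sort_values("Mean CV Accuracy", ascending=False))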
# Examine the most important features, according to the random forest model
# Get feature importance from Random Forest
importances = pd.Series(rf_model.feature_importances_, index=X_train.columns)
# Sort by importance
importances = importances.sort_values(ascending=False)
# Plot
plt.figure(figsize=(12, 8))
sns.barplot(x=importances, y=importances.index, palette="viridis")
plt.title("Feature Importance (Random Forest)")
plt.xlabel("Importance Score")
plt.ylabel("Features")
plt.show()
# Examine the most important features according to the logistic regression model
importance = lr_model.coef_[0]
features = df.drop('Churn', axis=1).columns
# Create a DataFrame for visualization
coef_df = pd.DataFrame({'Feature': features, 'Coefficient': importance})
# Sort by the absolute value of the coefficients to show the most influential features first
coef_df['AbsoluteCoefficient'] = coef_df['Coefficient'].abs()
coef_df = coef_df.sort_values(by='AbsoluteCoefficient', ascending=False)
# Plot
plt.figure(figsize=(12, 8))
sns.barplot(x='Coefficient', y='Feature', data=coef_df, palette='coolwarm', orient='h')
plt.title('Feature Coefficients (Logistic Regression)')
plt.xlabel('Coefficient Value')
plt.ylabel('Feature')
plt.axvline(0, color='black', linewidth=0.8) # Line to separate positive/negative influence
plt.show()
It should be noted that the two models have different lists of most important features because they work differently: The logistic regression model assumes a linear relationship between the features and the target, while random forest is a non-linear model that captures complex interactions between features.
Overall, both models consider ChargesPerMonth, tenure, MonthlyCharges and InternetService_Fiber optic important, so these features likely have a strong effect.
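One way to compare the two views directly is to put the random forest importances and the absolute logistic regression coefficients side by side; a minimal sketch using the objects created above (the scales differ, so rankings rather than raw values should be compared):
# Combine both measures of feature influence into one table
feature_comparison = pd.DataFrame({
    "RF importance": importances,
    "LR |coefficient|": coef_df.set_index("Feature")["AbsoluteCoefficient"],
})
print(feature_comparison.sort_values("RF importance", ascending=False).head(10))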
For further evaluation of the predictive power of the logistic regression model, we can plot a ROC (Receiver Operating Characteristic) curve to show the trade-off between the true positive rate and false positive rate for different thresholds.
# ROC curve for the logistic regression model
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
y_pred_proba = lr_model.predict_proba(X_test)[:, 1] # Probability estimates for the positive class (Churn = 1)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr) # Area Under the Curve
# Plot the ROC curve
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color="blue", lw=2, label=f"Logistic Regression (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], linestyle="--", color="gray", lw=2) # Diagonal reference line
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for Logistic Regression")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()
The ROC curve shows the false positive rate on the x-axis and the true positive rate on the y-axis. The higher the AUC (area under the curve), the better the predictive power of the model. The gray diagonal line indicates a random classifier with no predictive power. The logistic regression model has an AUC score of 0.85, making it a strong model.
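For completeness, the same AUC metric could also be computed for the other two models; a minimal sketch using roc_auc_score (the SVM supports predict_proba because it was created with probability=True):
# Compare AUC across the three fitted models
from sklearn.metrics import roc_auc_score
for name, model in [("Logistic Regression", lr_model), ("Random Forest", rf_model), ("SVM", svm_model)]:
    auc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    print(f"{name}: AUC = {auc_score:.3f}")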