In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
sns.set()
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
In [2]:
#import data
data = pd.read_csv(r'/Users/shivambadkas/Downloads/liver.csv')
In [3]:
data.head()
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         583 non-null    int64  
 1   Gender                      583 non-null    object 
 2   Total_Bilirubin             583 non-null    float64
 3   Direct_Bilirubin            583 non-null    float64
 4   Alkaline_Phosphotase        583 non-null    int64  
 5   Alamine_Aminotransferase    583 non-null    int64  
 6   Aspartate_Aminotransferase  583 non-null    int64  
 7   Total_Protiens              583 non-null    float64
 8   Albumin                     583 non-null    float64
 9   Albumin_and_Globulin_Ratio  579 non-null    float64
 10  Dataset                     583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB
In [4]:
#all features are numeric except Gender; the target "Dataset" is 1 for liver disease and 2 for no liver disease
print(data.isnull().sum())
Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    4
Dataset                       0
dtype: int64
In [5]:
#use scikit-learn's SimpleImputer to fill the four missing Albumin_and_Globulin_Ratio values with the column mean
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
data['Albumin_and_Globulin_Ratio'] = imp.fit_transform(data[['Albumin_and_Globulin_Ratio']])
In [6]:
data.isnull().sum()
Out[6]:
Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    0
Dataset                       0
dtype: int64
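A caveat: fitting the imputer on the full dataset before splitting leaks the test-set mean into training. A minimal leakage-free sketch using scikit-learn's Pipeline (the LogisticRegression here is just a placeholder model):

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

#impute inside a pipeline so the mean is learned from the training fold only
pipe = Pipeline([
    ('impute', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('model', LogisticRegression(max_iter=1000)),
])
#pipe.fit(X_train, y_train) then imputes and fits without touching the test set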
In [7]:
#EDA: compute the feature correlation matrix to see which features are correlated, if any
X = data.drop(['Gender', 'Dataset'], axis=1)
y = data['Dataset']
X.corr().style.background_gradient(cmap='coolwarm')
Out[7]:
Age Total_Bilirubin Direct_Bilirubin Alkaline_Phosphotase Alamine_Aminotransferase Aspartate_Aminotransferase Total_Protiens Albumin Albumin_and_Globulin_Ratio
Age 1.000000 0.011763 0.007529 0.080425 -0.086883 -0.019910 -0.187461 -0.265924 -0.216089
Total_Bilirubin 0.011763 1.000000 0.874618 0.206669 0.214065 0.237831 -0.008099 -0.222250 -0.206159
Direct_Bilirubin 0.007529 0.874618 1.000000 0.234939 0.233894 0.257544 -0.000139 -0.228531 -0.200004
Alkaline_Phosphotase 0.080425 0.206669 0.234939 1.000000 0.125680 0.167196 -0.028514 -0.165453 -0.233960
Alamine_Aminotransferase -0.086883 0.214065 0.233894 0.125680 1.000000 0.791966 -0.042518 -0.029742 -0.002374
Aspartate_Aminotransferase -0.019910 0.237831 0.257544 0.167196 0.791966 1.000000 -0.025645 -0.085290 -0.070024
Total_Protiens -0.187461 -0.008099 -0.000139 -0.028514 -0.042518 -0.025645 1.000000 0.784053 0.233904
Albumin -0.265924 -0.222250 -0.228531 -0.165453 -0.029742 -0.085290 0.784053 1.000000 0.686322
Albumin_and_Globulin_Ratio -0.216089 -0.206159 -0.200004 -0.233960 -0.002374 -0.070024 0.233904 0.686322 1.000000
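The background_gradient above colors the table itself rather than drawing a figure; if an actual heatmap is wanted, as the comment suggests, a minimal seaborn sketch using the same X:

#render the correlation matrix as a true heatmap figure
plt.figure(figsize=(10, 8))
sns.heatmap(X.corr(), annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Feature Correlation Heatmap')
plt.show()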
In [8]:
sns.countplot(data = data, x = 'Dataset', label='Count')
LD, NLD = data['Dataset'].value_counts()
print('Number of patients diagnosed with liver disease: ',LD)
print('Number of patients not diagnosed with liver disease: ',NLD)
Number of patients diagnosed with liver disease:  416
Number of patients not diagnosed with liver disease:  167
In [9]:
sns.countplot(data=data, x = 'Gender', label='Count')
M, F = data['Gender'].value_counts()
print('Number of patients that are male: ',M)
print('Number of patients that are female: ',F)
Number of patients that are male:  441
Number of patients that are female:  142
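Gender is dropped from X above, but it could be kept by encoding it numerically. A sketch, assuming the column holds the strings 'Male' and 'Female':

#map Gender to 0/1 so it can be used as a numeric feature
data['Gender_encoded'] = data['Gender'].map({'Male': 1, 'Female': 0})
#rebuilding X with data.drop(['Gender', 'Dataset'], axis=1) would then include it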
In [10]:
#import scikit-learn modules (train_test_split and RandomForestClassifier were already imported above)
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_curve, roc_auc_score
In [11]:
#split into train and test sets (70/30)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 101)
print (X_train.shape)
print (y_train.shape)
print (X_test.shape)
print (y_test.shape)
(408, 9)
(408,)
(175, 9)
(175,)
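Given the 416/167 class imbalance, a stratified split keeps the class ratio consistent across train and test; a sketch of the same split with stratification:

#stratify=y preserves the roughly 71/29 class ratio in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101, stratify=y)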
In [12]:
# Decision tree 
dt = DecisionTreeClassifier(max_depth=4,min_samples_leaf=0.14,random_state=1)
dt.fit(X_train,y_train)
dt_y_pred = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_y_pred)
print('Test set accuracy of dt: {:.2f}'.format(dt_accuracy)) 
Test set accuracy of dt: 0.69
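With max_depth capped at 4 the fitted tree is small enough to inspect directly; a sketch using scikit-learn's plot_tree:

from sklearn.tree import plot_tree

#visualize the learned splits; class order follows dt.classes_, i.e. [1, 2]
plt.figure(figsize=(16, 8))
plot_tree(dt, feature_names=list(X.columns),
          class_names=['liver disease', 'no liver disease'],
          filled=True, fontsize=8)
plt.show()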
In [13]:
#Logistic Regression
lr = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter = 1000)
lr.fit(X_train, y_train)
lr_y_pred = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_y_pred)
print('Test set accuracy of lr: {:.2f}'.format(lr_accuracy)) 
Test set accuracy of lr: 0.70
In [14]:
#K-Nearest Neighbors
KNN_1 = KNN(n_neighbors = 10)
KNN_1.fit(X_train,y_train)
KNN_y_pred = KNN_1.predict(X_test)
KNN_accuracy = accuracy_score(y_test, KNN_y_pred)
print('Test set accuracy of KNN: {:.2f}'.format(KNN_accuracy)) 
Test set accuracy of KNN: 0.65
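KNN is distance-based, so unscaled features such as Alkaline_Phosphotase (in the hundreds) dominate ones like Albumin (single digits). MinMaxScaler is imported above but never used; a sketch of scaling before fitting:

#scale features to [0, 1] before computing neighbor distances
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)  #fit on training data only
X_test_scaled = scaler.transform(X_test)
KNN_scaled = KNN(n_neighbors=10)
KNN_scaled.fit(X_train_scaled, y_train)
print('Scaled KNN accuracy: {:.2f}'.format(
    accuracy_score(y_test, KNN_scaled.predict(X_test_scaled))))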
In [16]:
#Random Forest Classifier 
rf = RandomForestClassifier(n_estimators=25, random_state=1)
rf.fit(X_train,y_train)
rf_y_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print('Test set accuracy of rf: {:.2f}'.format(rf_accuracy)) 
Test set accuracy of rf: 0.69
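n_estimators=25 is an arbitrary choice; a small grid search sketch (the grid values here are illustrative, not tuned):

from sklearn.model_selection import GridSearchCV

#search a small grid, scoring by ROC AUC since the classes are imbalanced
param_grid = {'n_estimators': [25, 100, 200], 'max_depth': [4, 8, None]}
grid = GridSearchCV(RandomForestClassifier(random_state=1),
                    param_grid, cv=5, scoring='roc_auc')
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)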
In [20]:
#Use random forest to see feature importance
importance = pd.Series(data=rf.feature_importances_,
                        index= X_train.columns)
#sort
importance_sorted = importance.sort_values()

importance_sorted.plot(kind='barh', color='lightgreen')
plt.title('Feature Importances')
plt.show()
In [19]:
#collect (name, model) pairs to automate evaluation; each follows the same define -> fit -> predict -> score pipeline
classifiers = [('Decision Tree', dt), ('Logistic Regression', lr), ('KNN', KNN_1), ('Random Forest', rf)]

#loop over the classifiers and report test accuracy
for clf_name, clf in classifiers:
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print('{:s} : {:.3f}'.format(clf_name, accuracy_score(y_test, y_pred)))
Decision Tree : 0.686
Logistic Regression : 0.697
KNN : 0.651
Random Forest : 0.691
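Single-split accuracies on 583 rows are noisy; a cross-validation sketch of the same comparison:

from sklearn.model_selection import cross_val_score

#5-fold CV gives a less split-dependent estimate than one hold-out set
for clf_name, clf in classifiers:
    scores = cross_val_score(clf, X, y, cv=5)
    print('{:s} : {:.3f} +/- {:.3f}'.format(clf_name, scores.mean(), scores.std()))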
In [17]:
#Model Evaluation: ROC curves capture the trade-off between sensitivity and specificity; capturing more
#positives means more negatives get labelled positive. The ROC curve plots sensitivity (recall) on the
#y-axis against the false positive rate (1 - specificity) on the x-axis: sort records by predicted
#probability of being positive, then compute cumulative true and false positive rates.
#first get each record's predicted probability for the second class (predict_proba column 1)
y_score1 = dt.predict_proba(X_test)[:,1]
y_score2 = lr.predict_proba(X_test)[:,1]
y_score3 = KNN_1.predict_proba(X_test)[:,1]
y_score4 = rf.predict_proba(X_test)[:,1]

#Find the AUC score: a random classifier scores 0.5, and a larger AUC means a better classifier
print('ROC DecisionTree: ', roc_auc_score(y_test, y_score1))
print('ROC Logistic Regression: ', roc_auc_score(y_test, y_score2))
print('ROC KNN: ', roc_auc_score(y_test, y_score3))
print('ROC Random Forest: ', roc_auc_score(y_test, y_score4))
ROC DecisionTree:  0.6856419987349779
ROC Logistic Regression:  0.7365591397849462
ROC KNN:  0.6859582542694497
ROC Random Forest:  0.7283364958886781
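roc_curve is imported above but never plotted; a sketch that draws all four curves. With labels 1/2, pos_label=2 matches the predict_proba column used for the scores:

#plot ROC curves for all four models against the chance diagonal
plt.figure(figsize=(8, 6))
for name, score in [('Decision Tree', y_score1), ('Logistic Regression', y_score2),
                    ('KNN', y_score3), ('Random Forest', y_score4)]:
    fpr, tpr, _ = roc_curve(y_test, score, pos_label=2)
    plt.plot(fpr, tpr, label='{} (AUC = {:.3f})'.format(name, roc_auc_score(y_test, score)))
plt.plot([0, 1], [0, 1], 'k--', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.legend()
plt.show()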
In [22]:
results = pd.DataFrame({
    'Model': ['Decision Tree', 'Logistic Regression', 'KNN', 'Random Forest Classifier'],
    'Roc_Score': [roc_auc_score(y_test, y_score1), roc_auc_score(y_test, y_score2),
                  roc_auc_score(y_test, y_score3), roc_auc_score(y_test, y_score4)],
    'Accuracy_Score': [dt_accuracy, lr_accuracy, KNN_accuracy, rf_accuracy]})
df_results = results.sort_values(by='Roc_Score', ascending=False)
df_results
Out[22]:
Model Roc_Score Accuracy_Score
1 Logistic Regression 0.736559 0.697143
3 Random Forest Classifier 0.728336 0.691429
2 KNN 0.685958 0.651429
0 Decision Tree 0.685642 0.685714
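Accuracy alone hides the class imbalance; a sketch of a fuller breakdown for the top model by ROC score (logistic regression):

from sklearn.metrics import classification_report, confusion_matrix

#per-class precision and recall are more informative than accuracy on a 416/167 split
print(confusion_matrix(y_test, lr_y_pred))
print(classification_report(y_test, lr_y_pred))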