# Load libraries
import numpy as np
import pandas as pd
import researchpy as rp
import matplotlib.pyplot as plt
import seaborn as sns
Hepatitis Risk Assessment: Predictive Modeling with Machine Learning
Problem
Hepatitis is a significant global health concern, affecting millions of individuals and posing a considerable burden on healthcare systems. Early detection and accurate prediction of hepatitis can lead to timely interventions, improved patient outcomes, and effective public health strategies. However, traditional diagnostic methods often have limitations in terms of accuracy and efficiency. There is a pressing need for a machine learning-based approach that can effectively predict the risk of hepatitis and aid in early identification of individuals who are likely to develop the disease.
Objective
Develop a machine learning model to accurately predict the risk of hepatitis by analyzing relevant factors, aiding in early identification and intervention.
# Load dataset
# Column names: the UCI Hepatitis data file ships without a header row.
col_names = ["Class", "AGE", "SEX", "STEROID", "ANTIVIRALS", "FATIGUE", "MALAISE",
             "ANOREXIA", "LIVER BIG", "LIVER FIRM", "SPLEEN PALPABLE", "SPIDERS",
             "ASCITES", "VARICES", "BILIRUBIN", "ALK PHOSPHATE", "SGOT", "ALBUMIN",
             "PROTIME", "HISTOLOGY"]

# Read the raw data ('?' marks missing values; handled later).
# NOTE(review): path previously contained a doubled slash ('data//'); normalized.
data = pd.read_csv('../data/hepatitis.data', names=col_names)
data.head()
Class | AGE | SEX | STEROID | ANTIVIRALS | FATIGUE | MALAISE | ANOREXIA | LIVER BIG | LIVER FIRM | SPLEEN PALPABLE | SPIDERS | ASCITES | VARICES | BILIRUBIN | ALK PHOSPHATE | SGOT | ALBUMIN | PROTIME | HISTOLOGY | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 30 | 2 | 1 | 2 | 2 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 1.00 | 85 | 18 | 4.0 | ? | 1 |
1 | 2 | 50 | 1 | 1 | 2 | 1 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 0.90 | 135 | 42 | 3.5 | ? | 1 |
2 | 2 | 78 | 1 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 0.70 | 96 | 32 | 4.0 | ? | 1 |
3 | 2 | 31 | 1 | ? | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 0.70 | 46 | 52 | 4.0 | 80 | 1 |
4 | 2 | 34 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 1.00 | ? | 200 | 4.0 | ? | 1 |
Exploring Data
# check shape of data: (n_rows, n_columns)
data.shape
(155, 20)
# dtypes: many columns load as object because '?' placeholders block numeric parsing
data.dtypes
Class int64
AGE int64
SEX int64
STEROID object
ANTIVIRALS int64
FATIGUE object
MALAISE object
ANOREXIA object
LIVER BIG object
LIVER FIRM object
SPLEEN PALPABLE object
SPIDERS object
ASCITES object
VARICES object
BILIRUBIN object
ALK PHOSPHATE object
SGOT object
ALBUMIN object
PROTIME object
HISTOLOGY int64
dtype: object
# info: non-null counts and dtypes per column
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Class 155 non-null int64
1 AGE 155 non-null int64
2 SEX 155 non-null int64
3 STEROID 155 non-null object
4 ANTIVIRALS 155 non-null int64
5 FATIGUE 155 non-null object
6 MALAISE 155 non-null object
7 ANOREXIA 155 non-null object
8 LIVER BIG 155 non-null object
9 LIVER FIRM 155 non-null object
10 SPLEEN PALPABLE 155 non-null object
11 SPIDERS 155 non-null object
12 ASCITES 155 non-null object
13 VARICES 155 non-null object
14 BILIRUBIN 155 non-null object
15 ALK PHOSPHATE 155 non-null object
16 SGOT 155 non-null object
17 ALBUMIN 155 non-null object
18 PROTIME 155 non-null object
19 HISTOLOGY 155 non-null int64
dtypes: int64(5), object(15)
memory usage: 24.3+ KB
# Replace the '?' missing-value marker with 0.
# NOTE(review): 0 is not a plausible value for labs such as ALK PHOSPHATE or
# PROTIME; replacing with np.nan and imputing would likely be sounder —
# TODO confirm downstream impact before changing.
data = data.replace('?', 0)
data.head()
Class | AGE | SEX | STEROID | ANTIVIRALS | FATIGUE | MALAISE | ANOREXIA | LIVER BIG | LIVER FIRM | SPLEEN PALPABLE | SPIDERS | ASCITES | VARICES | BILIRUBIN | ALK PHOSPHATE | SGOT | ALBUMIN | PROTIME | HISTOLOGY | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 30 | 2 | 1 | 2 | 2 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 1.00 | 85 | 18 | 4.0 | 0 | 1 |
1 | 2 | 50 | 1 | 1 | 2 | 1 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 0.90 | 135 | 42 | 3.5 | 0 | 1 |
2 | 2 | 78 | 1 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 0.70 | 96 | 32 | 4.0 | 0 | 1 |
3 | 2 | 31 | 1 | 0 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 0.70 | 46 | 52 | 4.0 | 80 | 1 |
4 | 2 | 34 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 1.00 | 0 | 200 | 4.0 | 0 | 1 |
# Convert string (object) columns to numeric types.
# Columns still stored as object after loading:
data.columns[data.dtypes == 'object']

# Discrete/integer-valued columns: go through float first so strings like
# '1.0' parse, then truncate to int.
int_cols = ['STEROID', 'FATIGUE', 'MALAISE', 'ANOREXIA', 'LIVER BIG', 'LIVER FIRM',
            'SPLEEN PALPABLE', 'SPIDERS', 'ASCITES', 'VARICES',
            'ALK PHOSPHATE', 'SGOT', 'PROTIME']
data[int_cols] = data[int_cols].astype(float).astype(int)

# BILIRUBIN and ALBUMIN are continuous lab values — cast straight to float.
# BUGFIX: previously these were cast to int first, truncating fractional
# values (e.g. BILIRUBIN 0.9 -> 0, ALBUMIN 3.5 -> 3); the later float cast
# could not recover the lost precision.
data[['BILIRUBIN', 'ALBUMIN']] = data[['BILIRUBIN', 'ALBUMIN']].astype(float)
data.head()
Class | AGE | SEX | STEROID | ANTIVIRALS | FATIGUE | MALAISE | ANOREXIA | LIVER BIG | LIVER FIRM | SPLEEN PALPABLE | SPIDERS | ASCITES | VARICES | BILIRUBIN | ALK PHOSPHATE | SGOT | ALBUMIN | PROTIME | HISTOLOGY | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 30 | 2 | 1 | 2 | 2 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 1 | 85 | 18 | 4 | 0 | 1 |
1 | 2 | 50 | 1 | 1 | 2 | 1 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 0 | 135 | 42 | 3 | 0 | 1 |
2 | 2 | 78 | 1 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 0 | 96 | 32 | 4 | 0 | 1 |
3 | 2 | 31 | 1 | 0 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 0 | 46 | 52 | 4 | 80 | 1 |
4 | 2 | 34 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 1 | 0 | 200 | 4 | 0 | 1 |
# Ensure the two continuous lab measurements are floats.
data[['BILIRUBIN', 'ALBUMIN']] = data[['BILIRUBIN', 'ALBUMIN']].astype(float)
data.head()
Class | AGE | SEX | STEROID | ANTIVIRALS | FATIGUE | MALAISE | ANOREXIA | LIVER BIG | LIVER FIRM | SPLEEN PALPABLE | SPIDERS | ASCITES | VARICES | BILIRUBIN | ALK PHOSPHATE | SGOT | ALBUMIN | PROTIME | HISTOLOGY | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 30 | 2 | 1 | 2 | 2 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 1.0 | 85 | 18 | 4.0 | 0 | 1 |
1 | 2 | 50 | 1 | 1 | 2 | 1 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 0.0 | 135 | 42 | 3.0 | 0 | 1 |
2 | 2 | 78 | 1 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 0.0 | 96 | 32 | 4.0 | 0 | 1 |
3 | 2 | 31 | 1 | 0 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 0.0 | 46 | 52 | 4.0 | 80 | 1 |
4 | 2 | 34 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 1.0 | 0 | 200 | 4.0 | 0 | 1 |
# Treat the target (Class) and SEX as categorical by storing them as object.
data[['Class', 'SEX']] = data[['Class', 'SEX']].astype(object)
data.head()
Class | AGE | SEX | STEROID | ANTIVIRALS | FATIGUE | MALAISE | ANOREXIA | LIVER BIG | LIVER FIRM | SPLEEN PALPABLE | SPIDERS | ASCITES | VARICES | BILIRUBIN | ALK PHOSPHATE | SGOT | ALBUMIN | PROTIME | HISTOLOGY | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 30 | 2 | 1 | 2 | 2 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 1.0 | 85 | 18 | 4.0 | 0 | 1 |
1 | 2 | 50 | 1 | 1 | 2 | 1 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 0.0 | 135 | 42 | 3.0 | 0 | 1 |
2 | 2 | 78 | 1 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 0.0 | 96 | 32 | 4.0 | 0 | 1 |
3 | 2 | 31 | 1 | 0 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 0.0 | 46 | 52 | 4.0 | 80 | 1 |
4 | 2 | 34 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 1.0 | 0 | 200 | 4.0 | 0 | 1 |
# Verify the dtype conversions took effect
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Class 155 non-null object
1 AGE 155 non-null int64
2 SEX 155 non-null object
3 STEROID 155 non-null int32
4 ANTIVIRALS 155 non-null int64
5 FATIGUE 155 non-null int32
6 MALAISE 155 non-null int32
7 ANOREXIA 155 non-null int32
8 LIVER BIG 155 non-null int32
9 LIVER FIRM 155 non-null int32
10 SPLEEN PALPABLE 155 non-null int32
11 SPIDERS 155 non-null int32
12 ASCITES 155 non-null int32
13 VARICES 155 non-null int32
14 BILIRUBIN 155 non-null float64
15 ALK PHOSPHATE 155 non-null int32
16 SGOT 155 non-null int32
17 ALBUMIN 155 non-null float64
18 PROTIME 155 non-null int32
19 HISTOLOGY 155 non-null int64
dtypes: float64(2), int32(13), int64(3), object(2)
memory usage: 16.5+ KB
# Check for missing data.
# NOTE(review): '?' markers were already replaced with 0, so isnull() reports
# zero everywhere even though some values were originally missing.
data.isnull().sum()
Class 0
AGE 0
SEX 0
STEROID 0
ANTIVIRALS 0
FATIGUE 0
MALAISE 0
ANOREXIA 0
LIVER BIG 0
LIVER FIRM 0
SPLEEN PALPABLE 0
SPIDERS 0
ASCITES 0
VARICES 0
BILIRUBIN 0
ALK PHOSPHATE 0
SGOT 0
ALBUMIN 0
PROTIME 0
HISTOLOGY 0
dtype: int64
Descriptive statistics
# Select numeric columns (everything except the object-typed Class and SEX)
num_cols = data.select_dtypes(exclude='object')
num_cols.head()
AGE | STEROID | ANTIVIRALS | FATIGUE | MALAISE | ANOREXIA | LIVER BIG | LIVER FIRM | SPLEEN PALPABLE | SPIDERS | ASCITES | VARICES | BILIRUBIN | ALK PHOSPHATE | SGOT | ALBUMIN | PROTIME | HISTOLOGY | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 30 | 1 | 2 | 2 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 1.0 | 85 | 18 | 4.0 | 0 | 1 |
1 | 50 | 1 | 2 | 1 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 0.0 | 135 | 42 | 3.0 | 0 | 1 |
2 | 78 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 0.0 | 96 | 32 | 4.0 | 0 | 1 |
3 | 31 | 0 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 0.0 | 46 | 52 | 4.0 | 80 | 1 |
4 | 34 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 1.0 | 0 | 200 | 4.0 | 0 | 1 |
# summary statistics of numerical variables
'AGE', 'STEROID', 'ANTIVIRALS', 'FATIGUE', 'MALAISE', 'ANOREXIA',
rp.summary_cont(num_cols[['LIVER BIG', 'LIVER FIRM', 'SPLEEN PALPABLE', 'SPIDERS', 'ASCITES',
'VARICES', 'BILIRUBIN', 'ALK PHOSPHATE', 'SGOT', 'ALBUMIN', 'PROTIME',
'HISTOLOGY']])
C:\Users\JHossain\anaconda3\lib\site-packages\researchpy\summary.py:60: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.
for ix, df_col in group1.iteritems():
Variable | N | Mean | SD | SE | 95% Conf. | Interval | |
---|---|---|---|---|---|---|---|
0 | AGE | 155.0 | 41.2000 | 12.5659 | 1.0093 | 39.2061 | 43.1939 |
1 | STEROID | 155.0 | 1.4968 | 0.5144 | 0.0413 | 1.4152 | 1.5784 |
2 | ANTIVIRALS | 155.0 | 1.8452 | 0.3629 | 0.0292 | 1.7876 | 1.9027 |
3 | FATIGUE | 155.0 | 1.3419 | 0.4894 | 0.0393 | 1.2643 | 1.4196 |
4 | MALAISE | 155.0 | 1.5935 | 0.5058 | 0.0406 | 1.5133 | 1.6738 |
5 | ANOREXIA | 155.0 | 1.7806 | 0.4305 | 0.0346 | 1.7123 | 1.8490 |
6 | LIVER BIG | 155.0 | 1.7097 | 0.5807 | 0.0466 | 1.6175 | 1.8018 |
7 | LIVER FIRM | 155.0 | 1.4710 | 0.6274 | 0.0504 | 1.3714 | 1.5705 |
8 | SPLEEN PALPABLE | 155.0 | 1.7419 | 0.5076 | 0.0408 | 1.6614 | 1.8225 |
9 | SPIDERS | 155.0 | 1.6065 | 0.5524 | 0.0444 | 1.5188 | 1.6941 |
10 | ASCITES | 155.0 | 1.8065 | 0.4712 | 0.0378 | 1.7317 | 1.8812 |
11 | VARICES | 155.0 | 1.8194 | 0.4625 | 0.0371 | 1.7460 | 1.8927 |
12 | BILIRUBIN | 155.0 | 0.9355 | 1.2674 | 0.1018 | 0.7344 | 1.1366 |
13 | ALK PHOSPHATE | 155.0 | 85.6194 | 62.0617 | 4.9849 | 75.7717 | 95.4670 |
14 | SGOT | 155.0 | 83.6774 | 89.5277 | 7.1910 | 69.4716 | 97.8832 |
15 | ALBUMIN | 155.0 | 3.0839 | 1.2739 | 0.1023 | 2.8817 | 3.2860 |
16 | PROTIME | 155.0 | 35.1161 | 35.2219 | 2.8291 | 29.5273 | 40.7050 |
17 | HISTOLOGY | 155.0 | 1.4516 | 0.4993 | 0.0401 | 1.3724 | 1.5308 |
# Select categorical (object-typed) columns: Class and SEX
cat_cols = data.select_dtypes(include='object')
cat_cols.head()
Class | SEX | |
---|---|---|
0 | 2 | 2 |
1 | 2 | 1 |
2 | 2 | 1 |
3 | 2 | 1 |
4 | 2 | 1 |
# Summary statistics (counts and percentages) of the categorical variables
rp.summary_cat(cat_cols[['Class', 'SEX']])
C:\Users\JHossain\anaconda3\lib\site-packages\researchpy\summary.py:225: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.
for ix, df_col in group1.iteritems():
Variable | Outcome | Count | Percent | |
---|---|---|---|---|
0 | Class | 2 | 123 | 79.35 |
1 | 1 | 32 | 20.65 | |
2 | SEX | 1 | 139 | 89.68 |
3 | 2 | 16 | 10.32 |
Correlations between Variables
# Pairwise correlations (Pearson's is the pandas default).
# numeric_only=True matches current behavior (object columns skipped) and
# silences the pandas FutureWarning about the changing default.
data.corr(method='pearson', numeric_only=True)
C:\Users\JHossain\AppData\Local\Temp\ipykernel_20536\427603040.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
data.corr(method='pearson')
AGE | STEROID | ANTIVIRALS | FATIGUE | MALAISE | ANOREXIA | LIVER BIG | LIVER FIRM | SPLEEN PALPABLE | SPIDERS | ASCITES | VARICES | BILIRUBIN | ALK PHOSPHATE | SGOT | ALBUMIN | PROTIME | HISTOLOGY | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
AGE | 1.000000 | -0.067710 | -0.013100 | -0.264635 | -0.099516 | 0.100589 | -0.126358 | -0.100979 | -0.127258 | -0.206551 | -0.125020 | -0.114412 | 0.123546 | 0.045328 | 0.078021 | -0.215645 | -0.059956 | 0.170780 |
STEROID | -0.067710 | 1.000000 | 0.101657 | 0.146292 | 0.232041 | 0.084752 | 0.159877 | 0.095280 | 0.071401 | 0.029781 | -0.002593 | -0.029759 | -0.000321 | -0.042245 | -0.021032 | 0.203561 | 0.032994 | -0.070144 |
ANTIVIRALS | -0.013100 | 0.101657 | 1.000000 | -0.065578 | -0.026703 | -0.052554 | -0.060626 | -0.019871 | -0.183070 | -0.176367 | -0.176382 | -0.167721 | 0.161671 | 0.041764 | 0.095980 | -0.154318 | -0.276452 | 0.209242 |
FATIGUE | -0.264635 | 0.146292 | -0.065578 | 1.000000 | 0.617660 | 0.419991 | 0.168796 | 0.254621 | 0.226850 | 0.380935 | 0.288877 | 0.217311 | -0.246897 | -0.110505 | -0.175476 | 0.234946 | 0.073783 | -0.131177 |
MALAISE | -0.099516 | 0.232041 | -0.026703 | 0.617660 | 1.000000 | 0.631662 | 0.192556 | 0.197903 | 0.145236 | 0.376666 | 0.376182 | 0.267029 | -0.253914 | -0.114810 | -0.177297 | 0.274981 | 0.067186 | -0.116964 |
ANOREXIA | 0.100589 | 0.084752 | -0.052554 | 0.419991 | 0.631662 | 1.000000 | 0.185164 | 0.144556 | 0.185004 | 0.371874 | 0.301514 | 0.288883 | -0.240332 | 0.035984 | -0.188183 | 0.152168 | 0.108322 | -0.049702 |
LIVER BIG | -0.126358 | 0.159877 | -0.060626 | 0.168796 | 0.192556 | 0.185164 | 1.000000 | 0.698517 | 0.338963 | 0.370229 | 0.339108 | 0.238645 | -0.043260 | 0.018895 | -0.026293 | 0.103349 | 0.174677 | -0.104757 |
LIVER FIRM | -0.100979 | 0.095280 | -0.019871 | 0.254621 | 0.197903 | 0.144556 | 0.698517 | 1.000000 | 0.322955 | 0.407114 | 0.310338 | 0.295100 | -0.075869 | -0.179977 | -0.149876 | 0.096499 | 0.083312 | -0.206632 |
SPLEEN PALPABLE | -0.127258 | 0.071401 | -0.183070 | 0.226850 | 0.145236 | 0.185004 | 0.338963 | 0.322955 | 1.000000 | 0.515462 | 0.495691 | 0.574616 | -0.177461 | -0.160211 | -0.116444 | 0.204411 | 0.124451 | -0.100840 |
SPIDERS | -0.206551 | 0.029781 | -0.176367 | 0.380935 | 0.376666 | 0.371874 | 0.370229 | 0.407114 | 0.515462 | 1.000000 | 0.553650 | 0.609491 | -0.240555 | -0.136225 | -0.180494 | 0.185624 | 0.204276 | -0.246077 |
ASCITES | -0.125020 | -0.002593 | -0.176382 | 0.288877 | 0.376182 | 0.301514 | 0.339108 | 0.310338 | 0.495691 | 0.553650 | 1.000000 | 0.672806 | -0.238515 | -0.058935 | -0.125707 | 0.200303 | 0.120694 | -0.178075 |
VARICES | -0.114412 | -0.029759 | -0.167721 | 0.217311 | 0.267029 | 0.288883 | 0.238645 | 0.295100 | 0.574616 | 0.609491 | 0.672806 | 1.000000 | -0.285886 | -0.148553 | -0.160748 | 0.169160 | 0.124468 | -0.178705 |
BILIRUBIN | 0.123546 | -0.000321 | 0.161671 | -0.246897 | -0.253914 | -0.240332 | -0.043260 | -0.075869 | -0.177461 | -0.240555 | -0.238515 | -0.285886 | 1.000000 | 0.036589 | 0.227189 | -0.153487 | -0.136134 | 0.261856 |
ALK PHOSPHATE | 0.045328 | -0.042245 | 0.041764 | -0.110505 | -0.114810 | 0.035984 | 0.018895 | -0.179977 | -0.160211 | -0.136225 | -0.058935 | -0.148553 | 0.036589 | 1.000000 | 0.141202 | 0.024143 | 0.135946 | 0.126923 |
SGOT | 0.078021 | -0.021032 | 0.095980 | -0.175476 | -0.177297 | -0.188183 | -0.026293 | -0.149876 | -0.116444 | -0.180494 | -0.125707 | -0.160748 | 0.227189 | 0.141202 | 1.000000 | -0.112325 | -0.038869 | 0.138095 |
ALBUMIN | -0.215645 | 0.203561 | -0.154318 | 0.234946 | 0.274981 | 0.152168 | 0.103349 | 0.096499 | 0.204411 | 0.185624 | 0.200303 | 0.169160 | -0.153487 | 0.024143 | -0.112325 | 1.000000 | 0.337852 | -0.151829 |
PROTIME | -0.059956 | 0.032994 | -0.276452 | 0.073783 | 0.067186 | 0.108322 | 0.174677 | 0.083312 | 0.124451 | 0.204276 | 0.120694 | 0.124468 | -0.136134 | 0.135946 | -0.038869 | 0.337852 | 1.000000 | -0.154030 |
HISTOLOGY | 0.170780 | -0.070144 | 0.209242 | -0.131177 | -0.116964 | -0.049702 | -0.104757 | -0.206632 | -0.100840 | -0.246077 | -0.178075 | -0.178705 | 0.261856 | 0.126923 | 0.138095 | -0.151829 | -0.154030 | 1.000000 |
Skewness
# skew per column: |skew| > 1 suggests a strongly asymmetric distribution
data.skew()
Class -1.464700
AGE 0.365294
SEX 2.633738
STEROID -0.131978
ANTIVIRALS -1.926980
FATIGUE 0.503794
MALAISE -0.536814
ANOREXIA -1.614447
LIVER BIG -1.880107
LIVER FIRM -0.768247
SPLEEN PALPABLE -1.841965
SPIDERS -1.017367
ASCITES -2.453535
VARICES -2.608785
BILIRUBIN 2.526856
ALK PHOSPHATE 0.692180
SGOT 3.159754
ALBUMIN -1.266422
PROTIME 0.369567
HISTOLOGY 0.196367
dtype: float64
Data visualizations
# Univariate distributions with histograms (numeric columns only)
data.select_dtypes(exclude="object").hist(figsize=(20, 15), edgecolor='black')
plt.show()
# Univariate distributions with density plots (numeric columns only)
data.select_dtypes(exclude="object").plot(kind='density', subplots=True, sharex=False,
                                          figsize=(20, 15), layout=(4, 5))
plt.show()
# Univariate distributions with box plots (numeric columns only)
data.select_dtypes(exclude="object").plot(kind='box', subplots=True, sharex=False,
                                          figsize=(20, 15), layout=(4, 5))
plt.show()
# Multivariate view: annotated correlation heatmap.
# numeric_only=True silences the pandas FutureWarning without changing output.
plt.figure(figsize=(20, 15))
corr = data.corr(numeric_only=True)
sns.heatmap(corr, annot=True)
plt.show()
C:\Users\JHossain\AppData\Local\Temp\ipykernel_20536\667096660.py:3: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
corr = data.corr()
Setup
# examine first few rows of data before handing it to PyCaret
data.head()
Class | AGE | SEX | STEROID | ANTIVIRALS | FATIGUE | MALAISE | ANOREXIA | LIVER BIG | LIVER FIRM | SPLEEN PALPABLE | SPIDERS | ASCITES | VARICES | BILIRUBIN | ALK PHOSPHATE | SGOT | ALBUMIN | PROTIME | HISTOLOGY | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 30 | 2 | 1 | 2 | 2 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 1.0 | 85 | 18 | 4.0 | 0 | 1 |
1 | 2 | 50 | 1 | 1 | 2 | 1 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 0.0 | 135 | 42 | 3.0 | 0 | 1 |
2 | 2 | 78 | 1 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 0.0 | 96 | 32 | 4.0 | 0 | 1 |
3 | 2 | 31 | 1 | 0 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 0.0 | 46 | 52 | 4.0 | 80 | 1 |
4 | 2 | 34 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 1.0 | 0 | 200 | 4.0 | 0 | 1 |
# Import the PyCaret classification module and initialize the experiment.
# session_id pins the random seed for reproducibility.
# (Wildcard import is the documented PyCaret convention.)
from pycaret.classification import *
setup(data, target='Class', session_id=123)
Description | Value | |
---|---|---|
0 | Session id | 123 |
1 | Target | Class |
2 | Target type | Binary |
3 | Target mapping | 1: 0, 2: 1 |
4 | Original data shape | (155, 20) |
5 | Transformed data shape | (155, 20) |
6 | Transformed train set shape | (108, 20) |
7 | Transformed test set shape | (47, 20) |
8 | Ordinal features | 1 |
9 | Numeric features | 18 |
10 | Categorical features | 1 |
11 | Preprocess | True |
12 | Imputation type | simple |
13 | Numeric imputation | mean |
14 | Categorical imputation | mode |
15 | Maximum one-hot encoding | 25 |
16 | Encoding method | None |
17 | Fold Generator | StratifiedKFold |
18 | Fold Number | 10 |
19 | CPU Jobs | -1 |
20 | Use GPU | False |
21 | Log Experiment | False |
22 | Experiment Name | clf-default-name |
23 | USI | 1871 |
<pycaret.classification.oop.ClassificationExperiment at 0x1bbae0b95a0>
Compare Models
# Cross-validate all baseline classifiers and keep the best by accuracy
best = compare_models()
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) | |
---|---|---|---|---|---|---|---|---|---|
dt | Decision Tree Classifier | 0.8527 | 0.7625 | 0.9083 | 0.9182 | 0.9084 | 0.4756 | 0.4904 | 0.3550 |
rf | Random Forest Classifier | 0.8445 | 0.8486 | 0.9542 | 0.8663 | 0.9074 | 0.3947 | 0.4109 | 0.4180 |
gbc | Gradient Boosting Classifier | 0.8445 | 0.8556 | 0.9444 | 0.8778 | 0.9075 | 0.4177 | 0.4411 | 0.3870 |
et | Extra Trees Classifier | 0.8355 | 0.8514 | 0.9306 | 0.8760 | 0.9001 | 0.3912 | 0.4058 | 0.4180 |
lr | Logistic Regression | 0.8264 | 0.8500 | 0.9083 | 0.8819 | 0.8931 | 0.3996 | 0.4108 | 0.8670 |
ridge | Ridge Classifier | 0.8264 | 0.0000 | 0.9208 | 0.8724 | 0.8937 | 0.3839 | 0.3983 | 0.3510 |
lightgbm | Light Gradient Boosting Machine | 0.8264 | 0.8319 | 0.9306 | 0.8679 | 0.8949 | 0.3766 | 0.3902 | 0.4770 |
qda | Quadratic Discriminant Analysis | 0.8255 | 0.5847 | 0.9778 | 0.8371 | 0.8998 | 0.2293 | 0.2427 | 0.3560 |
ada | Ada Boost Classifier | 0.8164 | 0.8090 | 0.9097 | 0.8702 | 0.8873 | 0.3717 | 0.3846 | 0.3690 |
lda | Linear Discriminant Analysis | 0.8164 | 0.8431 | 0.8972 | 0.8799 | 0.8858 | 0.3743 | 0.3852 | 0.3560 |
dummy | Dummy Classifier | 0.7964 | 0.5000 | 1.0000 | 0.7964 | 0.8862 | 0.0000 | 0.0000 | 0.3560 |
knn | K Neighbors Classifier | 0.7527 | 0.6944 | 0.9083 | 0.8067 | 0.8523 | 0.0649 | 0.0666 | 0.3790 |
svm | SVM - Linear Kernel | 0.7409 | 0.0000 | 0.9097 | 0.7965 | 0.8393 | 0.0213 | 0.0239 | 0.3550 |
nb | Naive Bayes | 0.7345 | 0.8458 | 0.7125 | 0.9381 | 0.8076 | 0.4001 | 0.4358 | 0.3510 |
Create Model
# Train a decision tree with 10-fold cross-validation
dt = create_model('dt')
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
Fold | |||||||
0 | 0.7273 | 0.4444 | 0.8889 | 0.8000 | 0.8421 | -0.1379 | -0.1491 |
1 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 |
2 | 0.8182 | 0.8889 | 0.7778 | 1.0000 | 0.8750 | 0.5600 | 0.6236 |
3 | 0.8182 | 0.5000 | 1.0000 | 0.8182 | 0.9000 | 0.0000 | 0.0000 |
4 | 0.8182 | 0.8889 | 0.7778 | 1.0000 | 0.8750 | 0.5600 | 0.6236 |
5 | 0.7273 | 0.4444 | 0.8889 | 0.8000 | 0.8421 | -0.1379 | -0.1491 |
6 | 0.9091 | 0.9375 | 0.8750 | 1.0000 | 0.9333 | 0.7925 | 0.8101 |
7 | 0.9091 | 0.8333 | 1.0000 | 0.8889 | 0.9412 | 0.7442 | 0.7698 |
8 | 0.8000 | 0.6875 | 0.8750 | 0.8750 | 0.8750 | 0.3750 | 0.3750 |
9 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 | 1.0000 |
Mean | 0.8527 | 0.7625 | 0.9083 | 0.9182 | 0.9084 | 0.4756 | 0.4904 |
Std | 0.0936 | 0.2138 | 0.0841 | 0.0861 | 0.0554 | 0.4154 | 0.4251 |
# print the fitted estimator and its hyperparameters
print(dt)
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
max_depth=None, max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
random_state=123, splitter='best')
Tune Model
# Tune hyperparameters of the decision tree (previous comment wrongly said "rf")
tuned_dt = tune_model(dt)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
Fold | |||||||
0 | 0.9091 | 0.6111 | 1.0000 | 0.9000 | 0.9474 | 0.6207 | 0.6708 |
1 | 0.9091 | 0.9167 | 0.8889 | 1.0000 | 0.9412 | 0.7442 | 0.7698 |
2 | 0.9091 | 0.6944 | 1.0000 | 0.9000 | 0.9474 | 0.6207 | 0.6708 |
3 | 0.8182 | 0.5833 | 1.0000 | 0.8182 | 0.9000 | 0.0000 | 0.0000 |
4 | 0.7273 | 0.8611 | 0.7778 | 0.8750 | 0.8235 | 0.2326 | 0.2406 |
5 | 0.7273 | 0.7778 | 0.8889 | 0.8000 | 0.8421 | -0.1379 | -0.1491 |
6 | 0.8182 | 0.8750 | 0.8750 | 0.8750 | 0.8750 | 0.5417 | 0.5417 |
7 | 0.9091 | 0.7917 | 1.0000 | 0.8889 | 0.9412 | 0.7442 | 0.7698 |
8 | 0.8000 | 0.8750 | 0.7500 | 1.0000 | 0.8571 | 0.5455 | 0.6124 |
9 | 0.8000 | 0.9375 | 0.7500 | 1.0000 | 0.8571 | 0.5455 | 0.6124 |
Mean | 0.8327 | 0.7924 | 0.8931 | 0.9057 | 0.8932 | 0.4457 | 0.4739 |
Std | 0.0694 | 0.1188 | 0.1002 | 0.0691 | 0.0457 | 0.2921 | 0.3103 |
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
# To also get the underlying search object, set return_tuner=True
tuned_dt, tuner = tune_model(dt, return_tuner=True)
Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|
Fold | |||||||
0 | 0.9091 | 0.6111 | 1.0000 | 0.9000 | 0.9474 | 0.6207 | 0.6708 |
1 | 0.9091 | 0.9167 | 0.8889 | 1.0000 | 0.9412 | 0.7442 | 0.7698 |
2 | 0.9091 | 0.6944 | 1.0000 | 0.9000 | 0.9474 | 0.6207 | 0.6708 |
3 | 0.8182 | 0.5833 | 1.0000 | 0.8182 | 0.9000 | 0.0000 | 0.0000 |
4 | 0.7273 | 0.8611 | 0.7778 | 0.8750 | 0.8235 | 0.2326 | 0.2406 |
5 | 0.7273 | 0.7778 | 0.8889 | 0.8000 | 0.8421 | -0.1379 | -0.1491 |
6 | 0.8182 | 0.8750 | 0.8750 | 0.8750 | 0.8750 | 0.5417 | 0.5417 |
7 | 0.9091 | 0.7917 | 1.0000 | 0.8889 | 0.9412 | 0.7442 | 0.7698 |
8 | 0.8000 | 0.8750 | 0.7500 | 1.0000 | 0.8571 | 0.5455 | 0.6124 |
9 | 0.8000 | 0.9375 | 0.7500 | 1.0000 | 0.8571 | 0.5455 | 0.6124 |
Mean | 0.8327 | 0.7924 | 0.8931 | 0.9057 | 0.8932 | 0.4457 | 0.4739 |
Std | 0.0694 | 0.1188 | 0.1002 | 0.0691 | 0.0457 | 0.2921 | 0.3103 |
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
# display the tuned estimator (PyCaret returned the original here, as it scored better)
tuned_dt
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, random_state=123, splitter='best')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, random_state=123, splitter='best')
# display the RandomizedSearchCV object used for tuning
tuner
RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False), error_score=nan, estimator=Pipeline(memory=FastMemory(location=C:\Users\JHossain\AppData\Local\Temp\joblib), steps=[('label_encoding', TransformerWrapperWithInverse(exclude=None, include=None, transformer=LabelEncoder())), ('numerical_imputer', TransformerWrapper(exclude=No... 'actual_estimator__max_features': [1.0, 'sqrt', 'log2'], 'actual_estimator__min_impurity_decrease': [0, 0.0001, 0.001, 0.01, 0.0002, 0.002, 0.02, 0.0005, 0.005, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5], 'actual_estimator__min_samples_leaf': [2, 3, 4, 5, 6], 'actual_estimator__min_samples_split': [2, 5, 7, 9, 10]}, pre_dispatch='2*n_jobs', random_state=123, refit=False, return_train_score=False, scoring='accuracy', verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False), error_score=nan, estimator=Pipeline(memory=FastMemory(location=C:\Users\JHossain\AppData\Local\Temp\joblib), steps=[('label_encoding', TransformerWrapperWithInverse(exclude=None, include=None, transformer=LabelEncoder())), ('numerical_imputer', TransformerWrapper(exclude=No... 'actual_estimator__max_features': [1.0, 'sqrt', 'log2'], 'actual_estimator__min_impurity_decrease': [0, 0.0001, 0.001, 0.01, 0.0002, 0.002, 0.02, 0.0005, 0.005, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5], 'actual_estimator__min_samples_leaf': [2, 3, 4, 5, 6], 'actual_estimator__min_samples_split': [2, 5, 7, 9, 10]}, pre_dispatch='2*n_jobs', random_state=123, refit=False, return_train_score=False, scoring='accuracy', verbose=1)
Pipeline(memory=FastMemory(location=C:\Users\JHossain\AppData\Local\Temp\joblib), steps=[('label_encoding', TransformerWrapperWithInverse(exclude=None, include=None, transformer=LabelEncoder())), ('numerical_imputer', TransformerWrapper(exclude=None, include=['AGE', 'STEROID', 'ANTIVIRALS', 'FATIGUE', 'MALAISE', 'ANOREXIA', 'LIVER BIG', 'LIVER FIRM', 'SPLEEN PALPABL... transformer=CleanColumnNames(match='[\\]\\[\\,\\{\\}\\"\\:]+'))), ('actual_estimator', DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, random_state=123, splitter='best'))], verbose=False)
TransformerWrapperWithInverse(transformer=LabelEncoder())
LabelEncoder()
LabelEncoder()
TransformerWrapper(include=['AGE', 'STEROID', 'ANTIVIRALS', 'FATIGUE', 'MALAISE', 'ANOREXIA', 'LIVER BIG', 'LIVER FIRM', 'SPLEEN PALPABLE', 'SPIDERS', 'ASCITES', 'VARICES', 'BILIRUBIN', 'ALK PHOSPHATE', 'SGOT', 'ALBUMIN', 'PROTIME', 'HISTOLOGY'], transformer=SimpleImputer())
SimpleImputer()
SimpleImputer()
TransformerWrapper(include=['SEX'], transformer=SimpleImputer(strategy='most_frequent'))
SimpleImputer(strategy='most_frequent')
SimpleImputer(strategy='most_frequent')
TransformerWrapper(include=['SEX'], transformer=OrdinalEncoder(cols=['SEX'], handle_missing='return_nan', mapping=[{'col': 'SEX', 'data_type': dtype('float64'), 'mapping': 1.0 0 2.0 1 NaN -1 dtype: int64}]))
OrdinalEncoder(cols=['SEX'], handle_missing='return_nan', mapping=[{'col': 'SEX', 'data_type': dtype('float64'), 'mapping': 1.0 0 2.0 1 NaN -1 dtype: int64}])
OrdinalEncoder(cols=['SEX'], handle_missing='return_nan', mapping=[{'col': 'SEX', 'data_type': dtype('float64'), 'mapping': 1.0 0 2.0 1 NaN -1 dtype: int64}])
TransformerWrapper(transformer=CleanColumnNames())
CleanColumnNames()
CleanColumnNames()
DecisionTreeClassifier(random_state=123)
Analyze Model
# Analyze the fitted decision tree from several angles
plot_model(dt, plot='confusion_matrix')  # confusion matrix on the holdout set
plot_model(dt, plot='auc')               # ROC curve / AUC
plot_model(dt, plot='class_report')      # per-class precision/recall/F1
plot_model(dt, plot='feature')           # feature importances
Evaluate Model
# evaluate model: interactive widget exposing all analysis plots
evaluate_model(dt)
Finalize Model
# finalize a model: refit on the full dataset (train + holdout).
# NOTE(review): the returned pipeline is displayed but not stored — assign it
# (e.g. final_dt = finalize_model(dt)) before saving/deploying.
finalize_model(dt)
Pipeline(memory=FastMemory(location=C:\Users\JHossain\AppData\Local\Temp\joblib), steps=[('label_encoding', TransformerWrapperWithInverse(exclude=None, include=None, transformer=LabelEncoder())), ('numerical_imputer', TransformerWrapper(exclude=None, include=['AGE', 'STEROID', 'ANTIVIRALS', 'FATIGUE', 'MALAISE', 'ANOREXIA', 'LIVER BIG', 'LIVER FIRM', 'SPLEEN PALPABL... transformer=CleanColumnNames(match='[\\]\\[\\,\\{\\}\\"\\:]+'))), ('actual_estimator', DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, random_state=123, splitter='best'))], verbose=False)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(memory=FastMemory(location=C:\Users\JHossain\AppData\Local\Temp\joblib), steps=[('label_encoding', TransformerWrapperWithInverse(exclude=None, include=None, transformer=LabelEncoder())), ('numerical_imputer', TransformerWrapper(exclude=None, include=['AGE', 'STEROID', 'ANTIVIRALS', 'FATIGUE', 'MALAISE', 'ANOREXIA', 'LIVER BIG', 'LIVER FIRM', 'SPLEEN PALPABL... transformer=CleanColumnNames(match='[\\]\\[\\,\\{\\}\\"\\:]+'))), ('actual_estimator', DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, random_state=123, splitter='best'))], verbose=False)
TransformerWrapperWithInverse(exclude=None, include=None, transformer=LabelEncoder())
LabelEncoder()
LabelEncoder()
TransformerWrapper(exclude=None, include=['AGE', 'STEROID', 'ANTIVIRALS', 'FATIGUE', 'MALAISE', 'ANOREXIA', 'LIVER BIG', 'LIVER FIRM', 'SPLEEN PALPABLE', 'SPIDERS', 'ASCITES', 'VARICES', 'BILIRUBIN', 'ALK PHOSPHATE', 'SGOT', 'ALBUMIN', 'PROTIME', 'HISTOLOGY'], transformer=SimpleImputer(add_indicator=False, copy=True, fill_value=None, keep_empty_features=False, missing_values=nan, strategy='mean', verbose='deprecated'))
SimpleImputer()
SimpleImputer()
TransformerWrapper(exclude=None, include=['SEX'], transformer=SimpleImputer(add_indicator=False, copy=True, fill_value=None, keep_empty_features=False, missing_values=nan, strategy='most_frequent', verbose='deprecated'))
SimpleImputer(strategy='most_frequent')
SimpleImputer(strategy='most_frequent')
TransformerWrapper(exclude=None, include=['SEX'], transformer=OrdinalEncoder(cols=['SEX'], drop_invariant=False, handle_missing='return_nan', handle_unknown='value', mapping=[{'col': 'SEX', 'data_type': dtype('float64'), 'mapping': 1.0 0 2.0 1 NaN -1 dtype: int64}], return_df=True, verbose=0))
OrdinalEncoder(cols=['SEX'], handle_missing='return_nan', mapping=[{'col': 'SEX', 'data_type': dtype('float64'), 'mapping': 1.0 0 2.0 1 NaN -1 dtype: int64}])
OrdinalEncoder(cols=['SEX'], handle_missing='return_nan', mapping=[{'col': 'SEX', 'data_type': dtype('float64'), 'mapping': 1.0 0 2.0 1 NaN -1 dtype: int64}])
TransformerWrapper(exclude=None, include=None, transformer=CleanColumnNames(match='[\\]\\[\\,\\{\\}\\"\\:]+'))
CleanColumnNames()
CleanColumnNames()
DecisionTreeClassifier(random_state=123)
Prediction
# Predict on the hold-out (test) set; predict_model appends
# `prediction_label` and `prediction_score` columns to the frame.
holdout_pred = predict_model(dt)
Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | |
---|---|---|---|---|---|---|---|---|
0 | Decision Tree Classifier | 0.6809 | 0.4689 | 0.8378 | 0.7750 | 0.8052 | -0.0698 | -0.0715 |
# show predictions df (first 5 rows, including prediction_label / prediction_score)
holdout_pred.head()
AGE | SEX | STEROID | ANTIVIRALS | FATIGUE | MALAISE | ANOREXIA | LIVER BIG | LIVER FIRM | SPLEEN PALPABLE | ... | VARICES | BILIRUBIN | ALK PHOSPHATE | SGOT | ALBUMIN | PROTIME | HISTOLOGY | Class | prediction_label | prediction_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
122 | 42 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 1 | ... | 2 | 1.0 | 85 | 40 | 0.0 | 0 | 2 | 1 | 2 | 1.0 |
44 | 34 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | ... | 2 | 0.0 | 0 | 86 | 0.0 | 0 | 1 | 1 | 2 | 1.0 |
112 | 52 | 1 | 1 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | ... | 2 | 1.0 | 85 | 30 | 4.0 | 0 | 2 | 1 | 2 | 1.0 |
75 | 32 | 1 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 | ... | 2 | 1.0 | 55 | 45 | 4.0 | 56 | 1 | 1 | 2 | 1.0 |
150 | 46 | 1 | 2 | 2 | 1 | 1 | 1 | 2 | 2 | 2 | ... | 1 | 7.0 | 0 | 242 | 3.0 | 50 | 2 | 0 | 2 | 1.0 |
5 rows × 22 columns
# copy data and drop Class variable
= data.copy()
new_data 'Class', axis=1, inplace=True)
new_data.drop( new_data.head()
AGE | SEX | STEROID | ANTIVIRALS | FATIGUE | MALAISE | ANOREXIA | LIVER BIG | LIVER FIRM | SPLEEN PALPABLE | SPIDERS | ASCITES | VARICES | BILIRUBIN | ALK PHOSPHATE | SGOT | ALBUMIN | PROTIME | HISTOLOGY | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 30 | 2 | 1 | 2 | 2 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 1.0 | 85 | 18 | 4.0 | 0 | 1 |
1 | 50 | 1 | 1 | 2 | 1 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 0.0 | 135 | 42 | 3.0 | 0 | 1 |
2 | 78 | 1 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 0.0 | 96 | 32 | 4.0 | 0 | 1 |
3 | 31 | 1 | 0 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 0.0 | 46 | 52 | 4.0 | 80 | 1 |
4 | 34 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 1.0 | 0 | 200 | 4.0 | 0 | 1 |
# predict model on new_data
= predict_model(best, data = new_data)
predictions predictions.head()
AGE | SEX | STEROID | ANTIVIRALS | FATIGUE | MALAISE | ANOREXIA | LIVER BIG | LIVER FIRM | SPLEEN PALPABLE | ... | ASCITES | VARICES | BILIRUBIN | ALK PHOSPHATE | SGOT | ALBUMIN | PROTIME | HISTOLOGY | prediction_label | prediction_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 30 | 2 | 1 | 2 | 2 | 2 | 2 | 1 | 2 | 2 | ... | 2 | 2 | 1.0 | 85 | 18 | 4.0 | 0 | 1 | 2 | 1.0 |
1 | 50 | 1 | 1 | 2 | 1 | 2 | 2 | 1 | 2 | 2 | ... | 2 | 2 | 0.0 | 135 | 42 | 3.0 | 0 | 1 | 2 | 1.0 |
2 | 78 | 1 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | ... | 2 | 2 | 0.0 | 96 | 32 | 4.0 | 0 | 1 | 2 | 1.0 |
3 | 31 | 1 | 0 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | ... | 2 | 2 | 0.0 | 46 | 52 | 4.0 | 80 | 1 | 2 | 1.0 |
4 | 34 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | ... | 2 | 2 | 1.0 | 0 | 200 | 4.0 | 0 | 1 | 2 | 1.0 |
5 rows × 21 columns
Save Model
# save pipeline
'../models/hepatitis') save_model(dt,
Transformation Pipeline and Model Successfully Saved
(Pipeline(memory=FastMemory(location=C:\Users\JHossain\AppData\Local\Temp\joblib),
steps=[('label_encoding',
TransformerWrapperWithInverse(exclude=None, include=None,
transformer=LabelEncoder())),
('numerical_imputer',
TransformerWrapper(exclude=None,
include=['AGE', 'STEROID', 'ANTIVIRALS',
'FATIGUE', 'MALAISE', 'ANOREXIA',
'LIVER BIG', 'LIVER FIRM',
'SPLEEN PALPABL...
transformer=CleanColumnNames(match='[\\]\\[\\,\\{\\}\\"\\:]+'))),
('trained_model',
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0,
random_state=123, splitter='best'))],
verbose=False),
'../models/hepatitis.pkl')
# Reload the saved pipeline from disk to verify round-tripping;
# the bare expression on the last line displays the pipeline repr.
loaded_best_pipeline = load_model('../models/hepatitis')
loaded_best_pipeline
Transformation Pipeline and Model Successfully Loaded
Pipeline(memory=FastMemory(location=C:\Users\JHossain\AppData\Local\Temp\joblib), steps=[('label_encoding', TransformerWrapperWithInverse(exclude=None, include=None, transformer=LabelEncoder())), ('numerical_imputer', TransformerWrapper(exclude=None, include=['AGE', 'STEROID', 'ANTIVIRALS', 'FATIGUE', 'MALAISE', 'ANOREXIA', 'LIVER BIG', 'LIVER FIRM', 'SPLEEN PALPABL... transformer=CleanColumnNames(match='[\\]\\[\\,\\{\\}\\"\\:]+'))), ('trained_model', DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, random_state=123, splitter='best'))], verbose=False)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(memory=FastMemory(location=C:\Users\JHossain\AppData\Local\Temp\joblib), steps=[('label_encoding', TransformerWrapperWithInverse(exclude=None, include=None, transformer=LabelEncoder())), ('numerical_imputer', TransformerWrapper(exclude=None, include=['AGE', 'STEROID', 'ANTIVIRALS', 'FATIGUE', 'MALAISE', 'ANOREXIA', 'LIVER BIG', 'LIVER FIRM', 'SPLEEN PALPABL... transformer=CleanColumnNames(match='[\\]\\[\\,\\{\\}\\"\\:]+'))), ('trained_model', DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, random_state=123, splitter='best'))], verbose=False)
TransformerWrapperWithInverse(exclude=None, include=None, transformer=LabelEncoder())
LabelEncoder()
LabelEncoder()
TransformerWrapper(exclude=None, include=['AGE', 'STEROID', 'ANTIVIRALS', 'FATIGUE', 'MALAISE', 'ANOREXIA', 'LIVER BIG', 'LIVER FIRM', 'SPLEEN PALPABLE', 'SPIDERS', 'ASCITES', 'VARICES', 'BILIRUBIN', 'ALK PHOSPHATE', 'SGOT', 'ALBUMIN', 'PROTIME', 'HISTOLOGY'], transformer=SimpleImputer(add_indicator=False, copy=True, fill_value=None, keep_empty_features=False, missing_values=nan, strategy='mean', verbose='deprecated'))
SimpleImputer()
SimpleImputer()
TransformerWrapper(exclude=None, include=['SEX'], transformer=SimpleImputer(add_indicator=False, copy=True, fill_value=None, keep_empty_features=False, missing_values=nan, strategy='most_frequent', verbose='deprecated'))
SimpleImputer(strategy='most_frequent')
SimpleImputer(strategy='most_frequent')
TransformerWrapper(exclude=None, include=['SEX'], transformer=OrdinalEncoder(cols=['SEX'], drop_invariant=False, handle_missing='return_nan', handle_unknown='value', mapping=[{'col': 'SEX', 'data_type': dtype('float64'), 'mapping': 1.0 0 2.0 1 NaN -1 dtype: int64}], return_df=True, verbose=0))
OrdinalEncoder(cols=['SEX'], handle_missing='return_nan', mapping=[{'col': 'SEX', 'data_type': dtype('float64'), 'mapping': 1.0 0 2.0 1 NaN -1 dtype: int64}])
OrdinalEncoder(cols=['SEX'], handle_missing='return_nan', mapping=[{'col': 'SEX', 'data_type': dtype('float64'), 'mapping': 1.0 0 2.0 1 NaN -1 dtype: int64}])
TransformerWrapper(exclude=None, include=None, transformer=CleanColumnNames(match='[\\]\\[\\,\\{\\}\\"\\:]+'))
CleanColumnNames()
CleanColumnNames()
DecisionTreeClassifier(random_state=123)