In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
In [3]:
#import data
data = pd.read_csv(r'/Users/shivambadkas/Downloads/insurance.csv', sep=',')
original_data = data.copy(deep=True)
In [4]:
print(data.head())
print(data.shape)
   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520
(1338, 7)
In [5]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
In [6]:
#check for duplicate values and drop if necessary
rs, cs = original_data.shape

data.drop_duplicates(inplace=True)

if data.shape==(rs,cs):
    print('\n\033[1mInference:\033[0m The dataset doesn\'t have any duplicates')
else:
    print(f'\n\033[1mInference:\033[0m Number of duplicates dropped/fixed ---> {rs-data.shape[0]}')
Inference: Number of duplicates dropped/fixed ---> 1
In [7]:
#check for empty values
nvc = pd.DataFrame(data.isnull().sum().sort_values(), columns=['Total Null Values'])
nvc['Percentage'] = round(nvc['Total Null Values']/data.shape[0],3)*100
print(nvc)
          Total Null Values  Percentage
age                       0         0.0
sex                       0         0.0
bmi                       0         0.0
children                  0         0.0
smoker                    0         0.0
region                    0         0.0
charges                   0         0.0
In [8]:
#check for correlation between the numeric features
print('\033[1mCorrelation Matrix\033[0m'.center(100))
plt.figure(figsize=[10,8])
sns.heatmap(data.corr(numeric_only=True), annot=True, cmap='ocean')  # numeric_only requires pandas >= 1.5
plt.show()
                                       Correlation Matrix                                       
In [9]:
#see distribution of y variable
sns.set(style = 'whitegrid')
f, a = plt.subplots(1,1, figsize=(12,12))
a = sns.histplot(data['charges'], kde=True, color='r')  # histplot replaces the deprecated distplot
plt.title('Distribution of Charges')
Out[9]:
Text(0.5, 1.0, 'Distribution of Charges')
In [10]:
#the charges distribution is right-skewed, so apply a log transformation to make it approximately normal
f, a = plt.subplots(1, 1, figsize=(12, 8))
a = sns.histplot(np.log10(data['charges']), kde=True, color='r')  # histplot replaces the deprecated distplot
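In [ ]:
#optional sketch, not part of the original run: quantify the skew before and after the log transform
#using pandas' Series.skew(); values near 0 indicate a roughly symmetric distribution
print('skewness of charges        :', data['charges'].skew())
print('skewness of log10(charges) :', np.log10(data['charges']).skew())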
In [11]:
#now that the response variable is approximately normal, we can use bar plots to see how the categorical features relate to charges
#this first plot shows charges by region and sex: men generally pay more, and costs are highest in the southeast
f, a = plt.subplots(1, 1, figsize=(12, 8))
a = sns.barplot(x='region', y='charges', hue='sex', data=data, palette='coolwarm')
In [12]:
#see charges by region and smoking status
f, ax = plt.subplots(1,1, figsize=(10,8))
ax = sns.barplot(x = 'region', y = 'charges',
                 hue='smoker', data=data, palette='coolwarm')
In [13]:
#families with more kids tend to have higher costs, although after 4 kids they seem lower
f, a = plt.subplots(1, 1, figsize=(10, 8))
ax = sns.barplot(x = 'region', y = 'charges',
                 hue='children', data=data, palette='Spectral')
In [14]:
#examine how smoking interacts with age, bmi, and number of children in driving charges
#clearly, smokers have much higher insurance costs
ax = sns.lmplot(x = 'age', y = 'charges', data=data, hue='smoker', palette='copper')
ax = sns.lmplot(x = 'bmi', y = 'charges', data=data, hue='smoker', palette='icefire')
ax = sns.lmplot(x = 'children', y = 'charges', data=data, hue='smoker', palette='winter')
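In [ ]:
#optional sketch, not part of the original run: put a number on the smoker gap by comparing mean charges per group
print(data.groupby('smoker')['charges'].mean())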
In [15]:
#convert categorical columns into numerical ones via one-hot encoding
data = pd.get_dummies(data)
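In [ ]:
#optional sketch, not part of the original run: inspect the one-hot encoded columns
#get_dummies keeps one column per category, so pairs like sex_female/sex_male are perfectly collinear;
#passing drop_first=True would drop one level per category and avoid the mirrored coefficients seen below
print(data.columns.tolist())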
In [16]:
#split into training and testing sets:
#first create variables for the features (X) and the target (y)
X= data.drop(['charges'], axis = 1)
y = data['charges']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
print('Original set  ---> ',X.shape,y.shape,
      '\nTraining set  ---> ',X_train.shape,y_train.shape,
      '\nTesting set   ---> ', X_test.shape,'', y_test.shape)
Original set  --->  (1337, 11) (1337,) 
Training set  --->  (935, 11) (935,) 
Testing set   --->  (402, 11)  (402,)
In [18]:
#fit linear regression on the training set and score it (R^2) on the test set
lr = LinearRegression()
lr.fit(X_train, y_train)
print(lr.intercept_)
print(lr.coef_)
print(lr.score(X_test, y_test))
-677.3688583661824
[   251.24617223    328.38309611    522.15536198     55.95358044
    -55.95358044 -11437.22320328  11437.22320328    541.85745202
     76.11085558   -394.24187351   -223.72643409]
0.7724363518631284
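In [ ]:
#optional sketch, not part of the original run: report test-set error in dollar terms for the fitted
#LinearRegression `lr` from the previous cell
from sklearn.metrics import mean_absolute_error, mean_squared_error
y_pred = lr.predict(X_test)
print('MAE :', mean_absolute_error(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))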
In [21]:
#use L2 regularization by applying ridge regression, which penalizes the squared magnitude of the coefficients
ridge = Ridge(alpha=0.5)
ridge.fit(X_train, y_train)
print(ridge.intercept_)
print(ridge.coef_)
print(ridge.score(X_test, y_test))
-685.0437283576284
[   251.17376437    328.05509124    522.60655537     53.85667096
    -53.85667096 -11403.31127882  11403.31127882    540.42120356
     73.41430624   -389.50531015   -224.33019964]
0.7720116778863656
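In [ ]:
#optional sketch, not part of the original run: instead of fixing alpha=0.5, let RidgeCV pick alpha by
#cross-validation over an assumed candidate grid
from sklearn.linear_model import RidgeCV
ridge_cv = RidgeCV(alphas=[0.01, 0.1, 0.5, 1.0, 10.0])
ridge_cv.fit(X_train, y_train)
print('chosen alpha:', ridge_cv.alpha_)
print('test R^2    :', ridge_cv.score(X_test, y_test))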
In [26]:
#use L1 regularization by applying lasso regression, which can shrink unimportant coefficients all the way to 0
#OLS already performs well here because the relationship is roughly linear and the number of observations far
#exceeds the number of features (n >> p); lasso is most useful when p is large relative to n, where OLS struggles
lasso = Lasso(alpha=0.9)
lasso.fit(X_train, y_train)
print(lasso.intercept_)
print(lasso.coef_)
print(lasso.score(X_test, y_test))
10488.112113030289
[ 2.51254377e+02  3.28157543e+02  5.21663671e+02  1.07519914e+02
 -0.00000000e+00 -2.28683511e+04  3.89092360e-12  7.57945509e+02
  2.92060761e+02 -1.69880942e+02 -0.00000000e+00]
0.7723874788075235
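In [ ]:
#optional sketch, not part of the original run: pair the lasso coefficients with the column names to see
#which features were shrunk to exactly 0 (assumes the fitted estimator is named `lasso`, as in the previous cell)
lasso_coefs = pd.Series(lasso.coef_, index=X_train.columns)
print(lasso_coefs[lasso_coefs == 0].index.tolist())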
In [37]:
#use a random forest regressor (criterion left at its default, squared error)
from sklearn.ensemble import RandomForestRegressor as rfr
Rfr = rfr(n_estimators=100, random_state=42, n_jobs=-1)
Rfr.fit(X_train, y_train)
y_train_pred = Rfr.predict(X_train)
y_test_pred = Rfr.predict(X_test)
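In [ ]:
#optional sketch, not part of the original run: compare train vs test R^2 to gauge how much the forest overfits
print('train R^2:', Rfr.score(X_train, y_train))
print('test R^2 :', Rfr.score(X_test, y_test))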
In [34]:
#rank feature importances; smoking, bmi, and age appear to be the most significant drivers of insurance cost
importance = pd.Series(data=Rfr.feature_importances_,
                        index= X_train.columns)
#sort
importance_sorted = importance.sort_values()

importance_sorted.plot(kind='barh', color='lightgreen')
plt.title('Feature Importances')
plt.show()