import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
#import data
data = pd.read_csv(r'/Users/shivambadkas/Downloads/insurance.csv', sep=',')
original_data = data.copy(deep=True)
print(data.head())
print(data.shape)
   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520
(1338, 7)
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   age       1338 non-null   int64
 1   sex       1338 non-null   object
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64
 4   smoker    1338 non-null   object
 5   region    1338 non-null   object
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
#check for duplicate values and drop if necessary
rs, cs = original_data.shape
data.drop_duplicates(inplace=True)
if data.shape == (rs, cs):
    print('\n\033[1mInference:\033[0m The dataset doesn\'t have any duplicates')
else:
    print(f'\n\033[1mInference:\033[0m Number of duplicates dropped/fixed ---> {rs-data.shape[0]}')
Inference: Number of duplicates dropped/fixed ---> 1
#check for empty values
nvc = pd.DataFrame(data.isnull().sum().sort_values(), columns=['Total Null Values'])
nvc['Percentage'] = round(nvc['Total Null Values']/data.shape[0],3)*100
print(nvc)
          Total Null Values  Percentage
age                       0         0.0
sex                       0         0.0
bmi                       0         0.0
children                  0         0.0
smoker                    0         0.0
region                    0         0.0
charges                   0         0.0
#check for correlation
print('\033[1mCorrelation Matrix'.center(100))
plt.figure(figsize=[10,8])
sns.heatmap(data.corr(numeric_only=True), annot=True, cmap='ocean')  #numeric_only avoids errors on the categorical columns
plt.show()
Correlation Matrix
#see distribution of y variable
sns.set(style = 'whitegrid')
f, a = plt.subplots(1,1, figsize=(12,12))
a = sns.histplot(data['charges'], kde=True, color='r')  #histplot replaces seaborn's deprecated distplot
plt.title('Distribution of Charges')
#the charges distribution appears right-skewed, so apply a log transformation to bring it closer to normal
f, a = plt.subplots(1, 1, figsize=(12, 8))
a = sns.histplot(np.log10(data['charges']), kde=True, color='r')
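#optional sketch, not used in the rest of this walkthrough: TransformedTargetRegressor can fit
#a model on the log10-scaled charges and automatically map predictions back to dollars
from sklearn.compose import TransformedTargetRegressor
log_model = TransformedTargetRegressor(regressor=LinearRegression(),
                                       func=np.log10,
                                       inverse_func=lambda v: 10 ** v)
#log_model.fit(X_train, y_train); log_model.predict(X_test) would return dollar amounts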
#now that the response variable is approximately normal, use subplots to visualize how the categorical features relate to charges
#the first plot shows charges by region and sex: men generally pay more, and costs are highest in the southeast
f, a = plt.subplots(1, 1, figsize=(12, 8))
a = sns.barplot(x='region', y='charges', hue='sex', data=data, palette='coolwarm')
#see charges by region and smoking status
f, ax = plt.subplots(1,1, figsize=(10,8))
ax = sns.barplot(x = 'region', y = 'charges',
hue='smoker', data=data, palette='coolwarm')
#families with more children tend to have higher costs, although costs appear lower again beyond 4 children
f, a = plt.subplots(1, 1, figsize=(10, 8))
ax = sns.barplot(x = 'region', y = 'charges',
hue='children', data=data, palette='Spectral')
#analyze how smoking interacts with age, bmi, and number of children in its effect on charges
#clearly smokers have higher insurance costs
ax = sns.lmplot(x = 'age', y = 'charges', data=data, hue='smoker', palette='copper')
ax = sns.lmplot(x = 'bmi', y = 'charges', data=data, hue='smoker', palette='icefire')
ax = sns.lmplot(x = 'children', y = 'charges', data=data, hue='smoker', palette='winter')
#Convert categorical data into numerical
data = pd.get_dummies(data)
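#note: get_dummies keeps one column per category, so pairs like sex_female/sex_male are perfectly
#collinear (visible as mirrored +/- coefficients further down); an alternative encoding, not used
#in the rest of this walkthrough, drops one level per categorical column
data_alt = pd.get_dummies(original_data.drop_duplicates(), drop_first=True)
print(data_alt.columns.tolist())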
#split into training and testing set:
#first create variables for the features and the target
X= data.drop(['charges'], axis = 1)
y = data['charges']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
print('Original set ---> ',X.shape,y.shape,
'\nTraining set ---> ',X_train.shape,y_train.shape,
'\nTesting set ---> ', X_test.shape,'', y_test.shape)
Original set --->  (1337, 11) (1337,)
Training set --->  (935, 11) (935,)
Testing set --->  (402, 11)  (402,)
#fit linear regression on training set. Use test set to determine accuracy of model
lr = LinearRegression()
lr.fit(X_train, y_train)
print(lr.intercept_)
print(lr.coef_)
print(lr.score(X_test, y_test))
-677.3688583661824
[ 251.24617223 328.38309611 522.15536198 55.95358044 -55.95358044 -11437.22320328 11437.22320328 541.85745202 76.11085558 -394.24187351 -223.72643409]
0.7724363518631284
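#for a sense of error scale beyond R^2, a quick sketch of MAE and RMSE on the held-out test set
from sklearn.metrics import mean_absolute_error, mean_squared_error
y_pred = lr.predict(X_test)
print('MAE :', mean_absolute_error(y_test, y_pred))           #average absolute error in dollars
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))   #penalizes large misses more heavily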
#use L2 regularization by applying ridge regression, which adds a penalty on the magnitude of the coefficients
ridge = Ridge(alpha=0.5)
ridge.fit(X_train, y_train)
print(ridge.intercept_)
print(ridge.coef_)
print(ridge.score(X_test, y_test))
-685.0437283576284
[ 251.17376437 328.05509124 522.60655537 53.85667096 -53.85667096 -11403.31127882 11403.31127882 540.42120356 73.41430624 -389.50531015 -224.33019964]
0.7720116778863656
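#the alpha of 0.5 above is arbitrary; a sketch of choosing it by cross-validation with RidgeCV
#(the grid of candidate alphas here is just an illustrative assumption)
from sklearn.linear_model import RidgeCV
ridge_cv = RidgeCV(alphas=[0.01, 0.1, 0.5, 1.0, 10.0])
ridge_cv.fit(X_train, y_train)
print(ridge_cv.alpha_, ridge_cv.score(X_test, y_test))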
#use L1 regularization by applying Lasso regression, which can shrink unimportant coefficients all the way to 0
#OLS already works well here because the relationship is roughly linear and the number of observations
#far exceeds the number of features (n >> p); Lasso is most useful when p approaches or exceeds n, where OLS breaks down
lasso = Lasso(alpha=0.9)
lasso.fit(X_train, y_train)
print(lasso.intercept_)
print(lasso.coef_)
print(lasso.score(X_test, y_test))
10488.112113030289
[ 2.51254377e+02  3.28157543e+02  5.21663671e+02  1.07519914e+02 -0.00000000e+00 -2.28683511e+04  3.89092360e-12  7.57945509e+02  2.92060761e+02 -1.69880942e+02 -0.00000000e+00]
0.7723874788075235
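#ElasticNet was imported at the top but never used; it blends the L1 and L2 penalties
#a minimal sketch -- the alpha and l1_ratio values are illustrative assumptions, not tuned choices
enet = ElasticNet(alpha=0.5, l1_ratio=0.5, random_state=42)
enet.fit(X_train, y_train)
print(enet.intercept_)
print(enet.coef_)
print(enet.score(X_test, y_test))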
#use a random forest regressor
from sklearn.ensemble import RandomForestRegressor as rfr
Rfr = rfr(n_estimators=100, criterion='squared_error',  #'squared_error' replaces the deprecated 'mse'
          random_state=42,
          n_jobs=-1)
Rfr.fit(X_train, y_train)
y_train_pred = Rfr.predict(X_train)
y_test_pred = Rfr.predict(X_test)
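#sanity check before looking at feature importances: compare train and test R^2 from the
#predictions above (a large gap would suggest overfitting)
from sklearn.metrics import r2_score
print('Train R^2:', r2_score(y_train, y_train_pred))
print('Test  R^2:', r2_score(y_test, y_test_pred))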
#rank feature importances; smoking, bmi, and age appear to be the most significant factors for insurance cost
importance = pd.Series(data=Rfr.feature_importances_,
index= X_train.columns)
#sort
importance_sorted = importance.sort_values()
importance_sorted.plot(kind='barh', color='lightgreen')
plt.title('Feature Importances')
plt.show()
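#since model_selection is imported at the top, a closing sketch comparing the models with
#5-fold cross-validated R^2 on the full feature matrix (the fold count is an assumption)
models = [('Linear', LinearRegression()), ('Ridge', Ridge(alpha=0.5)),
          ('Lasso', Lasso(alpha=0.9)), ('Random Forest', rfr(n_estimators=100, random_state=42))]
for name, model in models:
    scores = model_selection.cross_val_score(model, X, y, cv=5, scoring='r2')
    print(f'{name}: mean R^2 = {scores.mean():.3f} (+/- {scores.std():.3f})')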