9 minute read

로지스틱 회귀모델을 이용해서 타이타닉호 승선객의 생존여부 확인하기

age 결측치 값은 다음과 같이 처리
  1. 생존자(survived = 1 )는 생존자 나이의 평균으로 대체
  2. 생존자(survived = 0 )는 사망자 나이의 평균으로 대체
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
titanic = pd.read_csv("train.csv")
titanic.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
titanic.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
titanic.describe()
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
#변수제거
titanic_cln = titanic.drop(["PassengerId","Name","Ticket","Cabin"],axis=1)
titanic_cln.head()
Survived Pclass Sex Age SibSp Parch Fare Embarked
0 0 3 male 22.0 1 0 7.2500 S
1 1 1 female 38.0 1 0 71.2833 C
2 1 3 female 26.0 0 0 7.9250 S
3 1 1 female 35.0 1 0 53.1000 S
4 0 3 male 35.0 0 0 8.0500 S
titanic_cln.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB
#결측치 처리 

#생존자 나이 평균 
sur1_mean = titanic_cln[titanic_cln["Survived"]==1]["Age"].mean()

#사망자 나이 평균 
sur0_mean = titanic_cln[titanic_cln["Survived"]==0]["Age"].mean()

print(sur1_mean, sur0_mean)
28.343689655172415 30.62617924528302
titanic_cln[titanic_cln["Survived"]==1]["Age"].fillna(sur1_mean)
titanic_cln[titanic_cln["Survived"]==0]["Age"].fillna(sur0_mean)
0      22.000000
4      35.000000
5      30.626179
6      54.000000
7       2.000000
         ...    
884    25.000000
885    39.000000
886    27.000000
888    30.626179
890    32.000000
Name: Age, Length: 549, dtype: float64
#결측치 처리는 loc로 
titanic_cln.loc[titanic_cln["Survived"]==1,"Age"] = titanic_cln[titanic_cln["Survived"]==1]["Age"].fillna(sur1_mean)
titanic_cln.loc[titanic_cln["Survived"]==0,"Age"] = titanic_cln[titanic_cln["Survived"]==0]["Age"].fillna(sur0_mean)
#age 부분 다 채워진거 확인
titanic_cln.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB
titanic_cln["Embarked"].value_counts()
S    644
C    168
Q     77
Name: Embarked, dtype: int64
print(titanic_cln["Embarked"].value_counts().index[0])
#s가 제일 많으니까 나머지 결측값에도 s를 넣어줄것
S
titanic_cln["Embarked"] = titanic_cln["Embarked"].fillna(titanic_cln["Embarked"].value_counts().index[0])
titanic_cln.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB
print(titanic_cln["Pclass"].value_counts())
print(titanic_cln["Sex"].value_counts())
print(titanic_cln["Embarked"].value_counts())
3    491
1    216
2    184
Name: Pclass, dtype: int64
0    577
1    314
Name: Sex, dtype: int64
S    646
C    168
Q     77
Name: Embarked, dtype: int64
#Pclass, sex, Emabarked 변수 타입 변환
#sex 를 female -> 1 male -> 0 으로 변환 
titanic_cln["Sex"] = titanic_cln["Sex"].replace(["female","male"],[1,0])
titanic_cln.head()
#sex 변수 변환됨
Survived Pclass Sex Age SibSp Parch Fare Embarked
0 0 3 0 22.0 1 0 7.2500 S
1 1 1 1 38.0 1 0 71.2833 C
2 1 3 1 26.0 0 0 7.9250 S
3 1 1 1 35.0 1 0 53.1000 S
4 0 3 0 35.0 0 0 8.0500 S
#3개의 카테고리로 만들어줄것
titanic_cln["Pclass"]=titanic_cln["Pclass"].astype("category") 
titanic_cln["Embarked"]=titanic_cln["Embarked"].astype("category") 
titanic_cln.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Survived  891 non-null    int64   
 1   Pclass    891 non-null    category
 2   Sex       891 non-null    int64   
 3   Age       891 non-null    float64 
 4   SibSp     891 non-null    int64   
 5   Parch     891 non-null    int64   
 6   Fare      891 non-null    float64 
 7   Embarked  891 non-null    category
dtypes: category(2), float64(2), int64(4)
memory usage: 43.9 KB
titanic_cln_dum = pd.get_dummies(titanic_cln)
titanic_cln_dum.head()
Survived Sex Age SibSp Parch Fare Pclass_1 Pclass_2 Pclass_3 Embarked_C Embarked_Q Embarked_S
0 0 0 22.0 1 0 7.2500 0 0 1 0 0 1
1 1 1 38.0 1 0 71.2833 1 0 0 1 0 0
2 1 1 26.0 0 0 7.9250 0 0 1 0 0 1
3 1 1 35.0 1 0 53.1000 1 0 0 0 0 1
4 0 0 35.0 0 0 8.0500 0 0 1 0 0 1
titanic_cln_dum = sm.add_constant(titanic_cln_dum, has_constant="add")
titanic_cln_dum.head()
const const Survived Sex Age SibSp Parch Fare Pclass_1 Pclass_2 Pclass_3 Embarked_C Embarked_Q Embarked_S
0 1.0 1.0 0 0 22.0 1 0 7.2500 0 0 1 0 0 1
1 1.0 1.0 1 1 38.0 1 0 71.2833 1 0 0 1 0 0
2 1.0 1.0 1 1 26.0 0 0 7.9250 0 0 1 0 0 1
3 1.0 1.0 1 1 35.0 1 0 53.1000 1 0 0 0 0 1
4 1.0 1.0 0 0 35.0 0 0 8.0500 0 0 1 0 0 1
feature_columns = list(titanic_cln_dum.columns.difference(["Survived"]))


X = titanic_cln_dum[feature_columns]
y = titanic_cln_dum["Survived"]
x_train, x_test, y_train, y_test = train_test_split(X,y,
                                                   train_size=0.7, test_size =0.3,
                                                   random_state=102)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
(623, 13) (268, 13) (623,) (268,)
model = sm.Logit(y_train, x_train)
results = model.fit(method="newton")
Optimization terminated successfully.
         Current function value: 0.441385
         Iterations 7
results.summary()
Logit Regression Results
Dep. Variable: Survived No. Observations: 623
Model: Logit Df Residuals: 613
Method: MLE Df Model: 9
Date: Tue, 26 Jul 2022 Pseudo R-squ.: 0.3386
Time: 15:58:31 Log-Likelihood: -274.98
converged: True LL-Null: -415.74
Covariance Type: nonrobust LLR p-value: 2.171e-55
coef std err z P>|z| [0.025 0.975]
Age -0.0461 0.009 -4.941 0.000 -0.064 -0.028
Embarked_C 0.3284 4.11e+07 7.99e-09 1.000 -8.06e+07 8.06e+07
Embarked_Q 0.1857 nan nan nan nan nan
Embarked_S -0.3197 nan nan nan nan nan
Fare 0.0048 0.004 1.348 0.178 -0.002 0.012
Parch -0.0910 0.140 -0.649 0.516 -0.366 0.184
Pclass_1 1.0648 nan nan nan nan nan
Pclass_2 0.1762 nan nan nan nan nan
Pclass_3 -1.0466 nan nan nan nan nan
Sex 2.5486 0.239 10.675 0.000 2.081 3.017
SibSp -0.4101 0.136 -3.020 0.003 -0.676 -0.144
const 0.1944 3.45e+15 5.64e-17 1.000 -6.75e+15 6.75e+15
const 0.1944 1.25e+15 1.56e-16 1.000 -2.44e+15 2.44e+15
results.params
Age          -0.046104
Embarked_C    0.328389
Embarked_Q    0.185698
Embarked_S   -0.319655
Fare          0.004812
Parch        -0.091043
Pclass_1      1.064840
Pclass_2      0.176165
Pclass_3     -1.046573
Sex           2.548639
SibSp        -0.410117
const         0.194432
const         0.194432
dtype: float64
np.exp(results.params)
#pclass가 1인경우, 생존할 확률이 2배 높음
#변수의 범주가 나뉘지는 경우에는 카테고리로 나눠서 할것 
Age            0.954943
Embarked_C     1.388730
Embarked_Q     1.204058
Embarked_S     0.726400
Fare           1.004824
Parch          0.912979
Pclass_1       2.900376
Pclass_2       1.192634
Pclass_3       0.351139
Sex           12.789688
SibSp          0.663573
const          1.214621
const          1.214620
dtype: float64
results.aic
569.9662477300736
y_pred = results.predict(x_test)
y_pred
618    0.868298
849    0.954464
235    0.548751
865    0.715086
731    0.321682
         ...   
259    0.627825
760    0.089515
557    0.812292
638    0.358194
504    0.966472
Length: 268, dtype: float64
def PRED(y,threshold):
    Y= y.copy()
    Y[y > threshold] = 1
    Y[y <= threshold] = 0
    return(Y.astype(int))

Y_pred = PRED(y_pred,0.5)
Y_pred
618    1
849    1
235    1
865    1
731    0
      ..
259    1
760    0
557    1
638    0
504    1
Length: 268, dtype: int64
cfmat = confusion_matrix(y_test, Y_pred)
print(cfmat)
[[141  26]
 [ 26  75]]
def acc(cfmat):
    acc =(cfmat[0,0]+cfmat[1,1]/np.sum(cfmat))
    return(acc)

acc(cfmat)
141.27985074626866
threshold = np.arange(0,1,0.1)
table = pd.DataFrame(columns=["ACC"])

for i in threshold:
    Y_pred = PRED(y_pred,i)
    cfmat= confusion_matrix(y_test, Y_pred)
    table.loc[i] = acc(cfmat)
    
table.index.name="threshold"
table.columns.name="performance"
table

#0.6일때 정확도가 가장 높아짐
performance ACC
threshold
0.0 0.376866
0.1 52.358209
0.2 105.332090
0.3 121.313433
0.4 135.294776
0.5 141.279851
0.6 149.257463
0.7 161.190299
0.8 164.134328
0.9 166.082090
fpr, tpr, thresholds = metrics.roc_curve(y_test,y_pred, pos_label=1)
plt.plot(fpr,tpr)

auc=np.trapz(tpr,fpr)
print("AUC: ",auc)
AUC:  0.859370368174542

output_34_1

회귀계수 축소모형을 이용해서 타이타닉호 승선객의 생존여부 확인

from sklearn.linear_model import Ridge,Lasso,ElasticNet
lasso = Lasso(alpha = 0.01)
lasso.fit(x_train,y_train)
Lasso(alpha=0.01)
lasso.coef_
array([-0.00602402,  0.        ,  0.        , -0.05284511,  0.00103671,
       -0.00212009,  0.08610017, -0.        , -0.16710225,  0.42002796,
       -0.04601298,  0.        ,  0.        ])
pred_y_lasso = lasso.predict(x_test)
pred_Y_lasso = PRED(pred_y_lasso, 0.5)
cfmat = confusion_matrix(y_test, pred_Y_lasso)
print(acc(cfmat))
145.26865671641792
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred_y_lasso)
plt.plot(fpr,tpr)

auc=np.trapz(tpr,fpr)
print("AUC: ", auc)
AUC:  0.862571885931108

output_41_1

threshold = np.arange(0,1,0.1)
table = pd.DataFrame(columns = ["ACC"])

for i in threshold:
    Y_pred = PRED(pred_y_lasso, i)
    cfmat = confusion_matrix(y_test, Y_pred)
    table.loc[i]=acc(cfmat)
    
    
table.index.name = "threshold"
table.columns.name="performance"
table
performance ACC
threshold
0.0 2.376866
0.1 21.373134
0.2 93.332090
0.3 116.320896
0.4 137.298507
0.5 145.268657
0.6 161.235075
0.7 165.145522
0.8 166.097015
0.9 166.041045
alpha = np.logspace(-3,1,5)
alpha
array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01])
data = []
acc_table = []


for i,a in enumerate(alpha):
    lasso = Lasso(alpha=a).fit(x_train,y_train)
    data.append(pd.Series(np.hstack([lasso.intercept_, lasso.coef_])))
    y_pred = lasso.predict(x_test)
    y_pred =PRED(y_pred,0.5)
    cfmat = confusion_matrix(y_test,y_pred)
    acc_table.append((acc(cfmat)))
    
    
df_lasso = pd.DataFrame(data, index= alpha).T
df_lasso
#라소를 적용한 모형의 회귀계수
0.001 0.010 0.100 1.000 10.000
0 0.557221 0.519114 0.414021 0.314982 0.386838
1 -0.006679 -0.006024 -0.004045 -0.000000 -0.000000
2 0.018281 0.000000 0.000000 0.000000 0.000000
3 -0.000000 0.000000 0.000000 0.000000 -0.000000
4 -0.077428 -0.052845 -0.000000 -0.000000 -0.000000
5 0.000619 0.001037 0.002525 0.002147 0.000000
6 -0.014105 -0.002120 -0.000000 0.000000 0.000000
7 0.140224 0.086100 0.000000 0.000000 0.000000
8 -0.000000 -0.000000 0.000000 0.000000 0.000000
9 -0.190047 -0.167102 -0.000000 -0.000000 -0.000000
10 0.458285 0.420028 0.025301 0.000000 0.000000
11 -0.045830 -0.046013 -0.000000 -0.000000 -0.000000
12 0.000000 0.000000 0.000000 0.000000 0.000000
13 0.000000 0.000000 0.000000 0.000000 0.000000
acc_table_lasso = pd.DataFrame(acc_table, index= alpha).T
acc_table_lasso
#람다값이 0.01일때 정확도가 제일 높음
0.001 0.010 0.100 1.000 10.000
0 142.272388 145.268657 159.059701 162.048507 167.0

Ridge

ridge = Ridge(alpha = 0.01)
ridge.fit(x_train,y_train)
Ridge(alpha=0.01)
ridge.coef_
array([-0.00674968,  0.04090873,  0.01910686, -0.06001559,  0.00057219,
       -0.0154551 ,  0.16156199,  0.01545708, -0.17701907,  0.46253901,
       -0.04580177,  0.        ,  0.        ])
pred_y_ridge = ridge.predict(x_test)
pred_Y_ridge = PRED(pred_y_ridge, 0.5)
cfmat = confusion_matrix(y_test, pred_Y_ridge)
print(acc(cfmat))
142.27611940298507
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred_Y_ridge)
plt.plot(fpr,tpr)

auc=np.trapz(tpr,fpr)
print("AUC: ", auc)
AUC:  0.7914863342621689

output_51_1

threshold = np.arange(0,1,0.1)
table = pd.DataFrame(columns = ["ACC"])

for i in threshold:
    Y_pred = PRED(pred_Y_ridge, i)
    cfmat = confusion_matrix(y_test, Y_pred)
    table.loc[i]=acc(cfmat)
    
    
table.index.name = "threshold"
table.columns.name="performance"
table
performance ACC
threshold
0.0 142.276119
0.1 142.276119
0.2 142.276119
0.3 142.276119
0.4 142.276119
0.5 142.276119
0.6 142.276119
0.7 142.276119
0.8 142.276119
0.9 142.276119

ElasticNet

elastic = ElasticNet(alpha = 0.1, l1_ratio=0.5)
elastic.fit(x_train,y_train)
ElasticNet(alpha=0.1)
elastic.coef_
array([-0.00401088,  0.        ,  0.        , -0.        ,  0.0023124 ,
       -0.        ,  0.        ,  0.        , -0.00783898,  0.20741403,
       -0.02231027,  0.        ,  0.        ])
pred_y_elastic = elastic.predict(x_test)
pred_Y_elastic = PRED(pred_y_ridge, 0.5)
#0.5 기준으로 1과 0 판단
cfmat = confusion_matrix(y_test, pred_Y_elastic)
print(acc(cfmat))
142.27611940298507
fpr, tpr, thresholds = metrics.roc_curve(y_test, pred_Y_elastic)
plt.plot(fpr,tpr)

auc=np.trapz(tpr,fpr)
print("AUC: ", auc)
AUC:  0.7914863342621689

output_58_1

Updated: