import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

import statsmodels.api as sm

origin_train = pd.read_csv("train.csv")  # read the training-set file

clean_train = origin_train.copy()  # copy the training set; cleaning is done on the copy

clean_train.head()  # look at the first rows of the file

clean_train.info()  # overview of the data set (columns, non-null counts, dtypes)

<class 'pandas.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    str    
 4   Sex          891 non-null    str    
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    str    
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    str    
 11  Embarked     889 non-null    str    
dtypes: float64(2), int64(5), str(5)
memory usage: 83.7 KB

clean_train[clean_train['Age'].isnull()] # 查看Age列存在缺失值的表格数据

clean_train['Age'] = clean_train['Age'].fillna(clean_train['Age'].mean())  # 用 Age 列数据的平均值填充 Age 列的 NaN 值
clean_train['Age'].isnull().sum()  # 验证 Age 列的 NaN 值是否已被填充

np.int64(0)

clean_train[clean_train['Cabin'].isna()]  # rows without a recorded cabin

# Replace the sparse Cabin text with a flag: 1 = cabin recorded, 0 = not recorded.
clean_train['HasCabin'] = clean_train['Cabin'].notna().astype("int")
clean_train[['Cabin', 'HasCabin']]

clean_train = clean_train.drop(columns="Cabin")  # the raw Cabin column is no longer needed
clean_train

clean_train[clean_train['Embarked'].isna()]  # rows without an embarkation port

# Fill the missing ports with the most frequent value of the column.
embarked_mode = clean_train['Embarked'].mode()
clean_train['Embarked'] = clean_train['Embarked'].fillna(embarked_mode.iloc[0])
clean_train['Embarked'].isna().sum()  # recount missing values to confirm the fill

np.int64(0)

clean_train['PassengerId'] = clean_train['PassengerId'].astype("str")  # cast the PassengerId column to str
clean_train.info()

<class 'pandas.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    str    
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    str    
 4   Sex          891 non-null    str    
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    str    
 9   Fare         891 non-null    float64
 10  Embarked     891 non-null    str    
 11  HasCabin     891 non-null    int64  
dtypes: float64(2), int64(5), str(5)
memory usage: 83.7 KB

clean_train['Survived'] = clean_train['Survived'].astype('category')  # convert Survived to a categorical dtype
clean_train['Survived']

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: category
Categories (2, int64): [0, 1]

clean_train['Pclass'] = clean_train['Pclass'].astype('category')  # convert Pclass to a categorical dtype
clean_train['Pclass']

0      3
1      1
2      3
3      1
4      3
      ..
886    2
887    1
888    3
889    1
890    3
Name: Pclass, Length: 891, dtype: category
Categories (3, int64): [1, 2, 3]

clean_train['Sex'] = clean_train['Sex'].astype('category')  # convert Sex to a categorical dtype
clean_train['Sex']

0        male
1      female
2      female
3      female
4        male
        ...  
886      male
887    female
888    female
889      male
890      male
Name: Sex, Length: 891, dtype: category
Categories (2, str): ['female', 'male']

clean_train['Embarked'] = clean_train['Embarked'].astype('category')  # convert Embarked to a categorical dtype
clean_train['Embarked']

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: category
Categories (3, str): ['C', 'Q', 'S']

clean_train['HasCabin'] = clean_train['HasCabin'].astype('category')  # convert HasCabin to a categorical dtype
clean_train['HasCabin']

0      0
1      1
2      0
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: HasCabin, Length: 891, dtype: category
Categories (2, int64): [0, 1]

clean_train["PassengerId"].duplicated().sum()  # number of PassengerId values that repeat an earlier one

np.int64(0)

clean_train['Survived'].value_counts()  # passengers per survival outcome

Survived
0    549
1    342
Name: count, dtype: int64

clean_train['Pclass'].value_counts()  # passengers per cabin class

Pclass
3    491
1    216
2    184
Name: count, dtype: int64

clean_train['Sex'].value_counts()  # passengers per sex

Sex
male      577
female    314
Name: count, dtype: int64

clean_train['Embarked'].value_counts()  # passengers per embarkation port

Embarked
S    646
C    168
Q     77
Name: count, dtype: int64

clean_train['HasCabin'].value_counts()  # passengers with / without a recorded cabin

HasCabin
0    687
1    204
Name: count, dtype: int64

clean_train.describe()

# Derived feature: number of family members aboard = SibSp + Parch
# (companions/siblings count plus parents/children count).
clean_train["FamilyNum"] = clean_train["SibSp"] + clean_train["Parch"]
clean_train.head(10)

sns.set_palette("pastel")  # colour scheme for the charts
plt.rcParams.update({
    'figure.figsize': (7, 3.5),     # global figure width and height
    "figure.autolayout": True,      # adjust subplot spacing automatically
    'font.sans-serif': ['SimHei'],  # SimHei font so Chinese labels render
    'axes.unicode_minus': False,    # keep the minus sign '-' from rendering as a box
})

survived_count = clean_train['Survived'].value_counts()  # passengers in each Survived category
survived_count

Survived
0    549
1    342
Name: count, dtype: int64

# Pie chart of the overall outcome split.
# The wedge labels are looked up from the index of survived_count (0 = perished,
# 1 = survived) instead of being listed by hand, so they stay attached to the right
# wedge whatever order value_counts() returns the categories in.
plt.pie(
    survived_count,
    labels=[{0: '遇难', 1: '幸存'}[code] for code in survived_count.index],
    autopct="%.2f%%",
)
plt.show()

pclass_count_all = clean_train['Pclass'].value_counts()  # passengers per cabin class
pclass_count_all

Pclass
3    491
1    216
2    184
Name: count, dtype: int64

# Count the survivors in each cabin class
survived_df = clean_train[clean_train['Survived'] == 1]  # keep only the passengers who survived
pclass_count_survived = survived_df['Pclass'].value_counts()  # survivors per cabin class
pclass_count_survived

Pclass
1    136
3    119
2     87
Name: count, dtype: int64

fig, axes = plt.subplots(1, 2)  # figure with 1 row and 2 columns of panels
# Left panel: share of passengers in each cabin class. The labels come from the index of
# pclass_count_all, so they follow whatever order value_counts() produced instead of a
# hand-typed list that would silently mislabel wedges if the frequencies changed.
axes[0].pie(pclass_count_all, labels=[str(level) for level in pclass_count_all.index], autopct="%.2f%%")
# Right panel: passenger counts per cabin class split by outcome (0 = perished, 1 = survived).
sns.countplot(clean_train, x="Pclass", hue="Survived", ax=axes[1])
plt.show()

sex_count_all = clean_train['Sex'].value_counts()  # passengers per sex
sex_count_all

Sex
male      577
female    314
Name: count, dtype: int64

# Count the survivors of each sex
survived_df = clean_train[clean_train['Survived'] == 1]  # keep only the passengers who survived
sex_count_survived = survived_df['Sex'].value_counts()  # survivors per sex
sex_count_survived

Sex
female    233
male      109
Name: count, dtype: int64

fig, axes = plt.subplots(1, 2)  # figure with 1 row and 2 columns of panels
# Left panel: share of passengers per sex. Labels are read from the index of sex_count_all
# so each wedge is named after the category it actually represents.
axes[0].pie(sex_count_all, labels=[str(sex) for sex in sex_count_all.index], autopct="%.2f%%")
# Right panel: passenger counts per sex split by outcome (0 = perished, 1 = survived).
sns.countplot(clean_train, x='Sex', hue='Survived', ax=axes[1])
plt.show()

fig, axes = plt.subplots(1, 2, gridspec_kw={"width_ratios": [1, 5]})  # narrow left panel, wide right panel
sns.boxplot(clean_train['Age'], ax=axes[0])  # box plot: central tendency of passenger age
# Histogram of the overall age distribution. The target panel is passed explicitly
# (as the other two-panel figures do) rather than relying on whichever axes
# matplotlib currently treats as active.
sns.histplot(clean_train, x='Age', ax=axes[1])
plt.show()

# Age histogram split by outcome (0 = perished, 1 = survived); alpha keeps the overlap visible.
sns.histplot(clean_train, x='Age', hue='Survived', alpha=0.4)
plt.show()

fig, axes = plt.subplots(1, 2, figsize=(10, 5), gridspec_kw={"width_ratios": [1, 4]})  # narrow left panel, wide right panel
sns.boxplot(clean_train['Fare'], ax=axes[0])  # box plot: central tendency of the ticket fare
# Histogram of the overall fare distribution. The target panel is passed explicitly
# rather than relying on whichever axes matplotlib currently treats as active.
sns.histplot(clean_train, x="Fare", ax=axes[1])
plt.show()

# Fare histogram split by outcome (0 = perished, 1 = survived).
sns.histplot(clean_train, x="Fare", hue="Survived")
plt.show()

embarked_count_all = clean_train['Embarked'].value_counts()  # passengers per embarkation port
embarked_count_all

Embarked
S    646
C    168
Q     77
Name: count, dtype: int64

# Count the survivors from each embarkation port
survived_df = clean_train[clean_train['Survived'] == 1]  # keep only the passengers who survived
embarked_count_survived = survived_df['Embarked'].value_counts()  # survivors per embarkation port
embarked_count_survived

Embarked
S    219
C     93
Q     30
Name: count, dtype: int64

fig, axes = plt.subplots(1, 2)  # figure with 1 row and 2 columns of panels
# Left panel: share of passengers per embarkation port, labelled from the index of
# embarked_count_all so the names always match the wedges.
axes[0].pie(embarked_count_all, labels=[str(port) for port in embarked_count_all.index], autopct='%.2f%%')
# Right panel: passenger counts per port split by outcome (0 = perished, 1 = survived).
# The panel is passed explicitly instead of depending on the implicit current axes.
sns.countplot(clean_train, x='Embarked', hue='Survived', ax=axes[1])
plt.show()

has_cabin_count_all = clean_train['HasCabin'].value_counts()  # passengers with / without a recorded cabin
has_cabin_count_all

HasCabin
0    687
1    204
Name: count, dtype: int64

# Count the survivors with and without a recorded cabin
survived_df = clean_train[clean_train['Survived'] == 1]  # keep only the passengers who survived
has_cabin_count_survived = survived_df['HasCabin'].value_counts()  # survivors with / without a recorded cabin
has_cabin_count_survived

HasCabin
0    206
1    136
Name: count, dtype: int64

fig, axes = plt.subplots(1, 2)  # figure with 1 row and 2 columns of panels
# Left panel: share of passengers with (1) and without (0) a recorded cabin, labelled
# from the index of has_cabin_count_all so the names always match the wedges.
axes[0].pie(has_cabin_count_all, labels=[str(flag) for flag in has_cabin_count_all.index], autopct='%.2f%%')
# Right panel: counts per HasCabin value split by outcome (0 = perished, 1 = survived).
# The panel is passed explicitly instead of depending on the implicit current axes.
sns.countplot(clean_train, x='HasCabin', hue='Survived', ax=axes[1])
plt.show()

family_num_count_all = clean_train['FamilyNum'].value_counts()  # passengers per family-member count
family_num_count_all

FamilyNum
0     537
1     161
2     102
3      29
5      22
4      15
6      12
10      7
7       6
Name: count, dtype: int64

# Count the survivors for each family-member count
survived_df = clean_train[clean_train['Survived'] == 1]  # keep only the passengers who survived
family_num_count_survived = survived_df['FamilyNum'].value_counts()  # survivors per family-member count
family_num_count_survived

FamilyNum
0    163
1     89
2     59
3     21
6      4
5      3
4      3
Name: count, dtype: int64

fig, axes = plt.subplots(1, 2)  # figure with 1 row and 2 columns of panels
# Left panel: share of passengers per family-member count. The labels come from the
# index of family_num_count_all rather than a hand-typed list of nine values that had
# to mirror the frequency order exactly.
axes[0].pie(family_num_count_all, labels=[str(size) for size in family_num_count_all.index])
# Right panel: counts per family-member count split by outcome (0 = perished, 1 = survived).
# The panel is passed explicitly instead of depending on the implicit current axes.
sns.countplot(clean_train, x='FamilyNum', hue='Survived', ax=axes[1])
plt.show()

# Work on a separate copy so the cleaned data stays available unchanged.
lr_titanic_train = clean_train.copy()
lr_titanic_train.head()

# Identifier-like text columns are not used as predictors.
lr_titanic_train = lr_titanic_train.drop(columns=['PassengerId', 'Name', 'Ticket'])
lr_titanic_train.head()

# One-hot encode the categorical predictors as 0/1 integers; drop_first removes the
# first level of each variable so it acts as the reference category.
lr_titanic_train = pd.get_dummies(
    lr_titanic_train,
    columns=['Pclass', 'Sex', 'Embarked', 'HasCabin'],
    dtype=int,
    drop_first=True,
)
lr_titanic_train.head()

y = lr_titanic_train['Survived']  # dependent variable
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: category
Categories (2, int64): [0, 1]

X = lr_titanic_train.drop('Survived', axis=1)  # remove the dependent-variable column; every remaining column is a predictor
X.head()

sns.heatmap(X.corr().abs(), annot=True)  # heat map of the absolute pairwise correlations between predictors
plt.show()

# Step 1: remove the strongly correlated feature.
X = X.drop(columns=['SibSp'])
X.head()

# Step 2: remove features without a significant predictive effect in the model.
X = X.drop(columns=['Fare', 'Parch', 'Pclass_2', 'Embarked_Q'])
X.head()

# Step 3 (added later): Embarked_S was also found to have no significant effect.
X = X.drop(columns=['Embarked_S'])
X.head()

# Add the constant column that carries the intercept term.
X = sm.add_constant(X)
X.head()

model = sm.Logit(y, X)  # build the logistic-regression model
result = model.fit()  # fit the model
result.summary()  # show the fit results

Optimization terminated successfully.
         Current function value: 0.440651
         Iterations 6

# Express each fitted coefficient as a percentage change: exp(coef) is the multiplicative
# change for a one-unit increase (or for a 0 -> 1 switch of a dummy variable).
# The coefficients are read from the fitted model instead of being copied by hand from
# the summary table, so the printed figures cannot go stale when the model is refitted.
# NOTE(review): exp(coef) is an odds ratio, so these percentages describe changes in the
# odds of survival, not in the survival probability itself.
print(f"年龄每增加1岁，生还几率降低 {(1-np.exp(result.params['Age']))*100:.2f}%")
print(f"每多1名同乘家庭成员，生还几率降低 {(1-np.exp(result.params['FamilyNum']))*100:.2f}%")
print(f"三等舱乘客的生还几率比一等舱乘客低 {(1-np.exp(result.params['Pclass_3']))*100:.2f}%")
print(f"男性乘客的生还几率比女性乘客低 {(1-np.exp(result.params['Sex_male']))*100:.2f}%")
print(f"有登记船舱信息的乘客生还几率比无登记乘客高 {(np.exp(result.params['HasCabin_1'])-1)*100:.2f}%")

年龄每增加1岁，生还几率降低 3.69%
每多1名同乘家庭成员，生还几率降低 20.55%
三等舱乘客的生还几率比一等舱乘客低 70.52%
男性乘客的生还几率比女性乘客低 93.70%
有登记船舱信息的乘客生还几率比无登记乘客高 257.12%

# Read the test-set CSV file.
origin_test = pd.read_csv('test.csv')
origin_test.head()

# Keep the raw test data intact and preprocess a copy.
clean_test = origin_test.copy()
clean_test.head()

# Remove the columns already known not to be model inputs.
clean_test = clean_test.drop(columns=['Fare', 'PassengerId', 'Name', 'Ticket'])
clean_test.head()

clean_test.info()

<class 'pandas.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    str    
 2   Age       332 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Cabin     91 non-null     str    
 6   Embarked  418 non-null    str    
dtypes: float64(1), int64(3), str(3)
memory usage: 23.0 KB

# Fill missing test ages with the TRAINING-set mean. The model was fitted on training
# rows whose missing ages were replaced by that value, so a passenger with an unknown
# age must be encoded the same way at prediction time; using the test set's own mean
# would feed the model a different fill value than the one it was trained with.
clean_test['Age'] = clean_test['Age'].fillna(origin_train['Age'].mean())
clean_test['Age'].isnull().sum()  # confirm that no NaN values remain in Age

np.int64(0)

# Build a new column: 1 when a cabin is recorded, 0 when it is not
clean_test['HasCabin'] = clean_test['Cabin'].notnull().astype("int")  # create the HasCabin column
clean_test[['Cabin','HasCabin']]

clean_test['Pclass'] = clean_test['Pclass'].astype('category')  # convert Pclass to a categorical dtype
clean_test['Pclass']

0      3
1      3
2      2
3      3
4      3
      ..
413    3
414    1
415    3
416    3
417    3
Name: Pclass, Length: 418, dtype: category
Categories (3, int64): [1, 2, 3]

clean_test['Sex'] = clean_test['Sex'].astype('category')  # convert Sex to a categorical dtype
clean_test['Sex']

0        male
1      female
2        male
3        male
4      female
        ...  
413      male
414    female
415      male
416      male
417      male
Name: Sex, Length: 418, dtype: category
Categories (2, str): ['female', 'male']

clean_test['Embarked'] = clean_test['Embarked'].astype('category')  # convert Embarked to a categorical dtype
clean_test['Embarked']

0      Q
1      S
2      Q
3      S
4      S
      ..
413    S
414    C
415    S
416    S
417    C
Name: Embarked, Length: 418, dtype: category
Categories (3, str): ['C', 'Q', 'S']

clean_test["FamilyNum"] = clean_test["SibSp"] + clean_test["Parch"]  # family members aboard = SibSp + Parch (companions/siblings plus parents/children)
clean_test.head(10)

# One-hot encode the categorical predictors as 0/1 integers, dropping the first level of each
clean_test = pd.get_dummies(clean_test, columns=['Pclass', 'Sex', 'Embarked', 'HasCabin'], dtype=int, drop_first=True)
clean_test.head()

# Drop the raw columns and the dummy variables that were removed from the training matrix.
clean_test = clean_test.drop(['SibSp', 'Parch', 'Cabin', 'Pclass_2', 'Embarked_Q', 'Embarked_S'], axis=1)
clean_test.head()

# Add the intercept column. has_constant='add' forces the column to be created; the
# default ('skip') silently omits it when the data already contains a constant-valued
# column, which would leave the test matrix one column short.
clean_test = sm.add_constant(clean_test, has_constant='add')
# Put the columns in exactly the order of the training design matrix X, so each value
# meets the coefficient fitted for it; a missing column raises a KeyError here instead
# of producing misaligned predictions.
clean_test = clean_test[X.columns]
clean_test.head()

clean_test.info()  # final check that the test set is fully prepared

<class 'pandas.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   const       418 non-null    float64
 1   Age         418 non-null    float64
 2   FamilyNum   418 non-null    int64  
 3   Pclass_3    418 non-null    int64  
 4   Sex_male    418 non-null    int64  
 5   HasCabin_1  418 non-null    int64  
dtypes: float64(2), int64(4)
memory usage: 19.7 KB

result_test = result.predict(clean_test)  # run the fitted model on the test-set data
result_test

0      0.080819
1      0.409658
2      0.095998
3      0.104366
4      0.585002
         ...   
413    0.093425
414    0.934593
415    0.070340
416    0.093425
417    0.061072
Length: 418, dtype: float64

# A predicted probability above 0.61 is classified as survived, anything else as perished.
# The boolean comparison is cast straight to integers: 0 = perished, 1 = survived.
result_test = (result_test > 0.61).astype(int)
result_test

0      0
1      0
2      0
3      0
4      0
      ..
413    0
414    1
415    0
416    0
417    0
Length: 418, dtype: int64

# Build the result DataFrame: original PassengerId next to the predicted Survived value
result_test_dataframe = pd.DataFrame({'PassengerId':origin_test['PassengerId'], 'Survived':result_test})
result_test_dataframe.head()

result_test_dataframe.to_csv("result_test.csv", index=False)  # write the predictions to CSV without the index column

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	891.000000
mean	29.699118	0.523008	0.381594	32.204208
std	13.002015	1.102743	0.806057	49.693429
min	0.420000	0.000000	0.000000	0.000000
25%	22.000000	0.000000	0.000000	7.910400
50%	29.699118	0.000000	0.000000	14.454200
75%	35.000000	1.000000	0.000000	31.000000
max	80.000000	8.000000	6.000000	512.329200

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Embarked	HasCabin	FamilyNum
0	1	0	3	Braund, Mr. Owen Harris	male	22.000000	1	0	A/5 21171	7.2500	S	0	1
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.000000	1	0	PC 17599	71.2833	C	1	1
2	3	1	3	Heikkinen, Miss. Laina	female	26.000000	0	0	STON/O2. 3101282	7.9250	S	0	0
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.000000	1	0	113803	53.1000	S	1	1
4	5	0	3	Allen, Mr. William Henry	male	35.000000	0	0	373450	8.0500	S	0	0
5	6	0	3	Moran, Mr. James	male	29.699118	0	0	330877	8.4583	Q	0	0
6	7	0	1	McCarthy, Mr. Timothy J	male	54.000000	0	0	17463	51.8625	S	1	0
7	8	0	3	Palsson, Master. Gosta Leonard	male	2.000000	3	1	349909	21.0750	S	0	4
8	9	1	3	Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)	female	27.000000	0	2	347742	11.1333	S	0	2
9	10	1	2	Nasser, Mrs. Nicholas (Adele Achem)	female	14.000000	1	0	237736	30.0708	C	0	1

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Embarked	HasCabin	FamilyNum
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	S	0	1
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C	1	1
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	S	0	0
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	S	1	1
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	S	0	0

	const	Age	FamilyNum	Pclass_3	Sex_male	HasCabin_1
0	1.0	22.0	1	1	1	0
1	1.0	38.0	1	0	0	1
2	1.0	26.0	0	1	0	0
3	1.0	35.0	1	0	0	1
4	1.0	35.0	0	1	1	0

	Cabin	HasCabin
0	NaN	0
1	C85	1
2	NaN	0
3	C123	1
4	NaN	0
...	...	...
886	NaN	0
887	B42	1
888	NaN	0
889	C148	1
890	NaN	0

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Embarked	HasCabin
61	62	1	1	Icard, Miss. Amelie	female	38.0	0	0	113572	80.0	NaN	1
829	830	1	1	Stone, Mrs. George Nelson (Martha Evelyn)	female	62.0	0	0	113572	80.0	NaN	1

Dep. Variable:	Survived	No. Observations:	891
Model:	Logit	Df Residuals:	885
Method:	MLE	Df Model:	5
Date:	Mon, 13 Apr 2026	Pseudo R-squ.:	0.3383
Time:	15:21:59	Log-Likelihood:	-392.62
converged:	True	LL-Null:	-593.33
Covariance Type:	nonrobust	LLR p-value:	1.470e-84

	coef	std err	z	P>\|z\|	[0.025	0.975]
const	2.8511	0.345	8.254	0.000	2.174	3.528
Age	-0.0376	0.008	-4.852	0.000	-0.053	-0.022
FamilyNum	-0.2301	0.066	-3.501	0.000	-0.359	-0.101
Pclass_3	-1.2214	0.214	-5.716	0.000	-1.640	-0.803
Sex_male	-2.7654	0.199	-13.907	0.000	-3.155	-2.376
HasCabin_1	1.2729	0.244	5.209	0.000	0.794	1.752

	PassengerId	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	892	3	Kelly, Mr. James	male	34.5	0	0	330911	7.8292	NaN	Q
1	893	3	Wilkes, Mrs. James (Ellen Needs)	female	47.0	1	0	363272	7.0000	NaN	S
2	894	2	Myles, Mr. Thomas Francis	male	62.0	0	0	240276	9.6875	NaN	Q
3	895	3	Wirz, Mr. Albert	male	27.0	0	0	315154	8.6625	NaN	S
4	896	3	Hirvonen, Mrs. Alexander (Helga E Lindqvist)	female	22.0	1	1	3101298	12.2875	NaN	S

	Cabin	HasCabin
0	NaN	0
1	NaN	0
2	NaN	0
3	NaN	0
4	NaN	0
...	...	...
413	NaN	0
414	C105	1
415	NaN	0
416	NaN	0
417	NaN	0

项目：泰坦尼克沉船事件乘客幸存预测¶

事件背景¶

项目目标¶

数据描述¶

结论前置¶

读取数据¶

数据评估与清洗¶

数据整齐度评估¶

数据干净度评估¶

数据干净度分析¶

缺失值处理¶

Age缺失值处理¶

Cabin缺失值处理¶

Embarked 缺失值处理¶

处理异常类型¶

将 PassengerId 转为字符串类型¶

Survived、Pclass、Sex、Embarked、HasCabin转为分类类型¶

数据重复评估¶

数据一致性评估¶

无效或错误数据评估¶

特征扩展¶

数据探索¶

探索幸存比例¶

探索船舱等级与是否幸存之间的相关关系¶

探索性别与是否幸存之间的相关关系¶

探索乘客的年龄分布情况¶

探索年龄与是否幸存之间的相关关系¶

探索乘客的票价分布情况¶

探索票价金额与是否幸存之间的相关关系¶

探索登船港口与是否幸存之间的相关关系¶

探索是否有登记船舱信息与是否幸存之间的相关关系¶

探索家庭成员数量与是否幸存之间的相关关系¶

分析数据¶

构建逻辑回归模型¶

模型结果总结¶

使用逻辑回归模型¶

导入测试集数据¶

对测试集数据进行预处理¶

把已知的不需要用到的特征列删掉¶

评估剩下的特征¶

构造新特征¶

把自变量中的分类变量构造成虚拟变量¶

删除无关变量¶

为了保证截距不会被模型忽略，需要给自变量表格增加一个自变量列，其值全为 1¶

再看一眼概览，确保数据集正确清洗完成¶

使用预处理后的测试集数据进行模型预测¶

构建要上传到竞赛平台的结果数据集¶

保存测试集的预测结果¶