逻辑回归是机器学习中最基础的模型之一，虽然名字里带"回归"，但它在分类问题上有着较好的效果。下面介绍如何利用sklearn构建逻辑回归模型。

说到逻辑回归就不得不提一下线性回归，线性回归用wiki百科的定义来解释就是：在统计学中，线性回归是一种用来建立响应标量（因变量）和一个或多个解释变量（自变量）之间的模型关系的线性方法。线性回归分为一元线性回归和多元线性回归。均方误差是回归模型中常用的度量方法。一般用最小二乘法来最小化均方误差。

1. 数据集从Kaggle下载后，我们先读取数据并进行预览：

import pandas as pd
#read test data print(train.info()) #show the information about train
data,including counting values of null

2.了解数据：

print(train.isnull().sum()

发现数据中缺失数据Age有177个，Cabin 有687个，Embarked 有2个；由于Cabin缺失值过多（687/891），参考价值有限，直接删除该列。

c=train.Cabin.value_counts() #get the value of Cabin print(c)
train.drop(labels=Cabin,axis=1)
3.数据处理：

train.Embarked=train.Embarked.fillna('S') Em=train.Embarked.value_counts()
print(Em)

train['cc']=train.Name.map(lambda x:
str(re.compile(r',(.*)\.').findall(x)))#获取名字中的简称字样Mr,Miss,Mrs,Master,Dr等值
#替换上面的写法：train['cc']=train['Name'].apply(lambda
x:x.split(',')[1].split('.')[0].strip())
c=train.loc[:,['cc','Age']].query('Age>0').groupby('cc').mean() #按照名称辅助列看下各年龄的均值

train['Age']=train['Age'].fillna(0)#先对缺失值进行0填充 for i in range(1,891): if
train['Age'][i]==0 and train['cc'][i]=="[' Mr']": train.loc[i, 'Age']=32 if
train['Age'][i]==0 and train['cc'][i] =="[' Mrs']": train.loc[i, 'Age']= 35 if
train['Age'][i]==0 and train['cc'][i] == "[' Miss']": train.loc[i, 'Age']=20 if
train['Age'][i]==0 and train['cc'][i] == "[' Master']": train.loc[i, 'Age']= 4
if train['Age'][i]==0 and train['cc'][i] == "[' Dr']": train.loc[i,'Age']=42

value=['Mr','Miss','Mrs','Master','Dr'] for v in value:
train.loc[(train.Age==0)&(train.cc==v),'Age']=c.loc[v,'Age']

categore=train.dtypes[train.dtypes=='object'].index

train=train.replace({'Sex':{'male':1,'female':2},
'Embarked':{'S':1,'C':2,'Q':3}} )

data=data.drop(labels=['cc','Name','Ticket'],axis=1)

data=pd.concat([train,test],keys=(['train','test']))

train_data=data.xs('train')#分开得到level 为train的测试数据
test_data=data.xs('test').drop(labels='Survived',axis=1)
x_train=train_data.drop(labels='Survived',axis=1)
y_train=train_data['Survived'] test_data=test_data.fillna(0)
4.选择模型

from sklearn.preprocessing import StandardScaler from sklearn.linear_model
import LogisticRegression S=StandardScaler() S.fit(x_train)
x_train_stand=S.transform(x_train) x_test_stand=S.transform(test_data)
Log=LogisticRegression(C=10) Log.fit(x_train_stand,y_train) #训练模型
prediction=Log.predict(x_test_stand) #用训练的模型Log来预测测试数据
result=pd.DataFrame({'PassengerId':test_data.index,'Survived':prediction.astype(np.int32)})
#这里需要注意把prediction的数据转换成Int型不然系统判定不了，得分会为0
result.to_csv('D:\\pycm\\kaggle\\titanic\\result.csv',index=False) #设置不输出Index

import pandas as pd import numpy as np import seaborn as sns import matplotlib
import matplotlib.pyplot as plt import re from sklearn.preprocessing import
StandardScaler from sklearn.linear_model import LogisticRegression
data=pd.concat([train,test],keys=(['train','test'])) print(data.info())
data.Embarked=data.Embarked.fillna('S') data=data.drop(labels='Cabin',axis=1)
#data['cc']=data.Name.map(lambda x: str(re.compile(r',(.*)\.').findall(x)))
data['cc']=data['Name'].apply(lambda x:x.split(',')[1].split('.')[0].strip())
c=data.loc[:,['cc','Age']].query('Age>0').groupby('cc').mean()
print(c.loc['Miss','Age']) value=['Mr','Miss','Mrs','Master','Dr']
data['Age']=data['Age'].fillna(0) for v in value:
data.loc[(data.Age==0)&(data.cc==v),'Age']=c.loc[v,'Age']
data=data.drop(labels=['cc','Name','Ticket'],axis=1)
data=data.replace({'Sex':{'male':1,'female':2}, 'Embarked':{'S':1,'C':2,'Q':3}}
) train_data=data.xs('train')
test_data=data.xs('test').drop(labels='Survived',axis=1)
x_train=train_data.drop(labels='Survived',axis=1)
y_train=train_data['Survived'] test_data=test_data.fillna(0) S=StandardScaler()
S.fit(x_train) x_train_stand=S.transform(x_train)
x_test_stand=S.transform(test_data)
Log=RandomForestClassifier(oob_score=True,random_state=10)
Log.fit(x_train_stand,y_train) prediction=Log.predict(x_test_stand)
result=pd.DataFrame({'PassengerId':test_data.index,'Survived':prediction.astype(np.int32)})
result.to_csv('D:\\pycm\\kaggle\\titanic\\result.csv',index=False)

ioDraw流程图
API参考文档
OK工具箱