核心思路:

先用 LabelEncoder 对离散特征编码,因为 OneHotEncoder 只能处理数值。

然后使用OneHotEncoder编码,生成稀疏表示的特征

再使用sparse.hstack连接连续特征和稀疏特征

为什么不使用 pd.get_dummies 呢?因为它默认直接生成稠密矩阵,内存开销太大。


# coding=utf-8
# @author: bryan
"""Build sparse train/test feature matrices.

Pipeline: label-encode the discrete columns, one-hot encode each of them
into a sparse matrix, horizontally stack those onto the numeric columns,
then prepend bag-of-words counts of the ``corpus`` text column.

NOTE: ``data``, ``train``, ``test``, ``cate_feature``, ``con_feature`` and
``numeric_feature`` are not defined in this snippet; they must already be
in scope when it runs.
"""
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


def _space_join(text):
    """Turn a ';'-separated string into a space-separated one."""
    return ' '.join(text.split(';'))


# Map every discrete column of the full data set to integer codes.
for feature in cate_feature + con_feature:
    data[feature] = LabelEncoder().fit_transform(data[feature].values)

# NOTE(review): `train` / `test` are read below but never derived from `data`
# in this snippet -- presumably they are (re)split from the label-encoded
# `data` at this point; confirm against the full script.

enc = OneHotEncoder()
train_x = train[numeric_feature]
test_x = test[numeric_feature]
for feature in cate_feature + con_feature:
    # Fit on the full data so train and test share the same category columns.
    enc.fit(data[feature].values.reshape(-1, 1))
    train_a = enc.transform(train[feature].values.reshape(-1, 1))
    test_a = enc.transform(test[feature].values.reshape(-1, 1))
    # Append the sparse one-hot columns to the features collected so far.
    train_x = sparse.hstack((train_x, train_a))
    test_x = sparse.hstack((test_x, test_a))

# Text one-hot (bag-of-words) encoding.
# Join each row's ';'-separated tokens with spaces.
data['corpus'] = data['corpus'].apply(_space_join)

# If the corpus consists of numbers, fitting may fail with
# "empty vocabulary; perhaps the documents only contain stop words":
# the default token pattern only keeps tokens of two or more characters.
# Switching to CountVectorizer(token_pattern='(?u)\\b\\w+\\b') fixes that.
vectorizer = CountVectorizer()
property_feature = vectorizer.fit_transform(data['corpus'])

# The original referenced `train_property_feature` without ever defining it.
# Derive it with the vocabulary fitted above.
# Assumes `train` has a 'corpus' column whose rows line up with `train_x` --
# TODO confirm. `_space_join` is a no-op on text that is already joined.
train_property_feature = vectorizer.transform(
    train['corpus'].apply(_space_join)
)
train_x = sparse.hstack((train_property_feature, train_x))



友情链接
KaDraw流程图
API参考文档
OK工具箱
云服务器优惠
阿里云优惠券
腾讯云优惠券
华为云优惠券
站点信息
问题反馈
邮箱:ixiaoyang8@qq.com
QQ群:637538335
关注微信