1. 普通线性回归:通过使模型预测值与真实值之间的平均平方差尽可能小(即最小二乘估计法)来拟合模型,但容易陷入过度拟合(即低偏差、高方差),后面几种回归方法会引入正则化来缩减系数。
2. 普通线性回归+RFE:RFE是recursive feature elimination(递归特征消除)的缩写,在递归消除特征的过程中只保留no_features个最重要的特征,可以避免过度拟合;但RFE会直接舍弃一些变量,远没有下面几个方法给变量赋权重来得好。
3. L2缩减回归 - 岭回归:正则化项采用L2范数,alpha越大,缩减幅度越大。岭回归比LASSO的预测能力好一点,但LASSO能自动完成变量选择。
4. L1缩减回归 - LASSO:Least absolute shrinkage and selection operator(最小绝对值缩减和选择算子),LASSO更偏向于稀疏的结果:如果一个结果的大多数系数被压缩为0,那么它被称为稀疏的。LASSO会把大多数系数压缩为0,对相互关联的变量只选择保留其中一个。
RFE:
# -*- coding: utf-8 -*- """ Created on Thu Apr 05 19:52:39 2018 @author: Alvin
AI """ from sklearn.datasets import load_boston from sklearn.cross_validationi
import train_test_split from sklearn.linear_model import LinearRegression from
sklearn.metrics import mean_squared_error import matplotlib.pyplot as plt from
sklearn.preprocessing import PolynomialFeatures from itertools import
combinations from sklearn.feature_selection import RFE #载入数据 def get_data():
data = load_boston() x = data['data'] y = data['target'] return x,y #建立模型
#让回归特征消除(RFE-recursive feature elimination)只保留no_features个最重要的特征 def
build_model(x,y,no_features): model =
LinearRegression(normalize=True,fit_intercept=True) rfe_model =
RFE(estimator=model,n_features_to_select=no_features) rfe_model.fit(x,y) return
rfe_model #查看模型 def view_model(model): print "\nmodel coefficients" print
"===================\n" #coef_提供了一个系数矩阵,intercept_提供了回归常数 for i,coef in
enumerate(model.coef_): print "\t coefficient %d %model"%(i+1,coef) print
"\n\tintercept %0.3f"%(model.intercept_) #计算均平方差用以评估模型误差 def
model_worth(true_y,predicted_y): print "\t mean squared error =
%0.2f"%(mean_squared_error(true_y,predicted_y)) return
mean_squared_error(true_y,predicted_y) #绘制残差图 def plot_residual(y,predicted_y):
plt.cla() plt.xlabel('predicted y') plt.ylabel('residual') plt.title('residual
plot') plt.figure1(1) diff = y - predicted_y plt.plot(predicted_y,diff,'go')
plt.show() if __name__=="__main__": x,y = get_data() #划分数据集
x_train,x_test_all,y_train,y_test_all = train_test_split(x,y,\
test_size=0.3,random_state=9) x_dev,x_test,y_dev,y_test =
train_test_split(x_test_all,y_test_all,\ test_size=0.3,random_state=9)
#准备一些多项式特征 poly_features =
PolynomialFeatures(interaction_only=True)#只有x1和x2交互一起的,x1^2这种不行 x_train_poly =
poly_features.fit_transform(x_train) x_dev_poly =
poly_features.fit_transform(x_dev) choosen_model =
build_model(x_train_poly,y_train,20) predicted_y =
choosen_model.predict(x_train_poly) mse = model_worth(y_train,predicted_y)
x_test_poly = poly_features.fit_transform(x_test) predicted_y =
choosen_model.predict(x_test_poly) model_worth(y_test,predicted_y)LASSO:
# -*- coding: utf-8 -*- """ Created on Mon Apr 09 09:08:51 2018 @author: Alvin
AI """ from sklearn.datasets import load_boston from sklearn.model_selection
import train_test_split from sklearn.linear_model import Lasso,
LinearRegression from sklearn.metrics import mean_squared_error from
sklearn.preprocessing import PolynomialFeatures import matplotlib.pyplot as plt
import numpy as np #加载数据 def get_data(): data = load_boston() x = data['data']
y = data['target'] return x,y #建立模型 def build_models(x,y): alpha_range =
np.linspace(0,0.5,200) model = Lasso(normalize=True)#只需要标准化,不需要中心化
coeffiecients = [] #对每个alpha值适配模型 for alpha in alpha_range:
model.set_params(alpha=alpha) model.fit(x,y)
coeffiecients.append(model.coef_)#追踪系数用来绘图 #print coeffiecients #维度为200*13
#绘制系数权重变化和对应的alpha值 #绘制模型的RMSE和对应的alpha值 coeff_path(alpha_range,coeffiecients)
#查看系数值 #view_model(model) #查看回归系数值 def view_model(model): print "\n model
coeffiecients" print "======================" for i,coef in
enumerate(model.coef_): print "\t coefficient %d %0.3f" % (i+1,coef) print
"\n\t intercept %0.3f" % (model.intercept_) #评估模型 def
model_worth(true_y,predicted_y): print "\t mean squared error = %0.2f\n" % \
(mean_squared_error(true_y,predicted_y)) #绘制不同alpha值情况下的系数权重 def
coeff_path(alpha_range,coeffiecients): plt.close('all') plt.cla() plt.figure(1)
plt.xlabel("Alpha Values") plt.ylabel("coeffiecient weights for different alpha
values") plt.plot(alpha_range,coeffiecients)
plt.axis('tight')#修改x、y坐标的范围让所有的数据显示出来 plt.show() #主函数调用,查看保留下来的回归系数有哪些 def
get_coef(x,y,alpha): model = Lasso(normalize=True,alpha=alpha) model.fit(x,y)
coefs = model.coef_ indices = [i for i,coef in enumerate(coefs) if abs(coef) >
0.0] return indices #电泳所有函数 if __name__ == "__main__": x,y = get_data()
#用不用的alpha值多次建模,并绘出图形 build_models(x,y) print "\npredicting using all the
variables\n" full_model = LinearRegression(normalize=True) full_model.fit(x,y)
predicted_y = full_model.predict(x) model_worth(y,predicted_y) print "\n models
at different alpha values\n" alpa_values = [0.22,0.08,0.01] for alpha in
alpa_values: indices = get_coef(x,y,alpha) print "\t alpha = %0.2f number of
variables selected = %d\ " % (alpha,len(indices))#看保留下来的回归系数有多少 print "\t
attributes include ", indices#看保留下来的回归系数有哪些 x_new = x[:,indices] model =
LinearRegression(normalize=True) model.fit(x_new,y) predicted_y =
model.predict(x_new) model_worth(y,predicted_y)
岭回归+交叉验证迭代器:针对数据较少的情况,把训练集划分为K份,模型在其中K-1份数据上进行训练,剩下的一份用作测试,这样就不需要单独划分dev集,这种方法也叫K折交叉验证法。
# -*- coding: utf-8 -*- """ Created on Mon Apr 09 14:30:10 2018 @author: Alvin
AI """ from sklearn.datasets import load_boston from sklearn.cross_validation
import KFold,train_test_split from sklearn.linear_model import Ridge from
sklearn.grid_search import GridSearchCV from sklearn.metrics import
mean_squared_error from sklearn.preprocessing import PolynomialFeatures import
numpy as np #载入数据 def get_data(): data = load_boston() x = data['data'] y =
data['target'] return x,y #构建模型 def build_model(x,y): kfold =
KFold(y.shape[0],5)#K折交叉检验划分训练集和测试集,5份数据集(每份包括训练和测试) model =
Ridge(normalize=True)#标准化数据并采用岭回归模型 alpha_range =
np.linspace(0.0015,0.0017,30)#生成alpha测试集 grid_param = {"alpha":alpha_range}
#GridSearchCV帮助我们采用一个范围内参数对模型进行训练 #cv定义了感兴趣的交叉验证类型 grid =
GridSearchCV(estimator=model,param_grid=grid_param,cv=kfold,\
scoring='mean_squared_error') grid.fit(x,y)
display_param_results(grid.grid_scores_)#展示均方误差平均值 print
grid.best_params_#打印最好的参数和评估量 #追踪均方残差的计量用于绘制图形 return grid.best_estimator_
#查看回归系数和截距 def view_model(model): #print "\n estimated alpha = %0.3f" %
model.alpha_#打印模型采用的alpha值 print "\n model coeffiecients" print
"======================\n" for i,coef in enumerate(model.coef_): print "\t
coefficent %d %0.3f" % (i+1,coef) print "\n\t intercept %0.3f" %
(model.intercept_) #模型评估 def model_worth(true_y,predicted_y): print "\t Mean
squared error = %0.2f" % (mean_squared_error(true_y,predicted_y)) return
mean_squared_error(true_y,predicted_y) #展示参数结果 def
display_param_results(param_results): fold = 1 for param_result in
param_results: print "fold %d mean squared error %0.2f" %
(fold,abs(param_result[1]\ )),param_result[0] fold+=1 if __name__ ==
"__main__": x,y = get_data() #将数据集划分为训练集和测试集 x_train,x_test,y_train,y_test=
train_test_split(x,y,test_size=0.3,\ random_state=9) #准备一些多项式特征 poly_features =
PolynomialFeatures(interaction_only=True) x_train_poly =
poly_features.fit_transform(x_train) x_test_poly =
poly_features.fit_transform(x_test) choosen_model =
build_model(x_train_poly,y_train) predicted_y =
choosen_model.predict(x_train_poly) model_worth(y_train,predicted_y)
view_model(choosen_model) predicted_y = choosen_model.predict(x_test_poly)
model_worth(y_test,predicted_y)
热门工具 换一换