1. 普通线性回归:使模型输出的预测值与真实值之间的平均平方差尽可能小(即最小二乘估计法),但容易陷入过度拟合(即低偏差、高方差),后续的回归方法会引入正则化来缩减回归系数。
2. 普通线性回归+RFE:RFE是recursive feature elimination(递归特征消除)的缩写,在递归消除特征的过程中只保留no_features个最重要的特征,可以避免过度拟合;但RFE会直接舍弃一些变量,远没有下面几个方法给变量赋权重来得好。
3. L2缩减回归 - 岭回归:正则化项采用L2范数,alpha越大,缩减幅度越大。岭回归比LASSO的预测能力好点,但LASSO能动态完成变量选择。
4. L1缩减回归 - LASSO:Least absolute shrinkage and selection operator(最小绝对值缩减和选择算子),LASSO更偏向于稀疏的结果:如果一个结果中大多数系数被压缩为0,那么它被称为稀疏的。LASSO会把大多数系数变成0,对于相互关联的变量,只选择保留其中一个。
RFE:
# -*- coding: utf-8 -*-
"""
Linear regression with recursive feature elimination (RFE) on the Boston data.

Created on Thu Apr 05 19:52:39 2018

@author: Alvin AI

Every print call below passes a single pre-formatted string, so the script
produces the same output under Python 2 and Python 3.
"""
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from itertools import combinations
from sklearn.feature_selection import RFE


def get_data():
    """Load the Boston housing data and return ``(x, y)``."""
    data = load_boston()
    x = data['data']
    y = data['target']
    return x, y


def build_model(x, y, no_features):
    """Fit a linear regression wrapped in RFE and return the fitted RFE object.

    RFE (recursive feature elimination) is asked to keep only the
    ``no_features`` most important features.
    """
    model = LinearRegression(normalize=True, fit_intercept=True)
    rfe_model = RFE(estimator=model, n_features_to_select=no_features)
    rfe_model.fit(x, y)
    return rfe_model


def view_model(model):
    """Print the coefficients and the intercept of a fitted model.

    ``model`` may be a fitted linear estimator or a fitted ``RFE`` wrapper.
    An ``RFE`` object does not expose ``coef_`` itself; it keeps the fitted
    regressor in ``estimator_``, so that attribute is used when present.
    """
    fitted = getattr(model, "estimator_", model)
    print("\nmodel coefficients")
    print("===================\n")
    # coef_ holds the regression coefficients, intercept_ the constant term.
    for i, coef in enumerate(fitted.coef_):
        print("\t coefficient %d %0.3f" % (i + 1, coef))
    print("\n\tintercept %0.3f" % (fitted.intercept_))


def model_worth(true_y, predicted_y):
    """Print and return the mean squared error of the predictions."""
    mse = mean_squared_error(true_y, predicted_y)
    print("\t mean squared error = %0.2f" % (mse))
    return mse


def plot_residual(y, predicted_y):
    """Plot the residuals ``y - predicted_y`` against the predicted values."""
    # Select the figure first so the labels land on the axes that get drawn.
    plt.figure(1)
    plt.cla()
    plt.xlabel('predicted y')
    plt.ylabel('residual')
    plt.title('residual plot')
    diff = y - predicted_y
    plt.plot(predicted_y, diff, 'go')
    plt.show()


if __name__ == "__main__":
    x, y = get_data()

    # Split into train (70%) and a held-out part, then split the held-out
    # part again into dev and test.
    x_train, x_test_all, y_train, y_test_all = train_test_split(
        x, y, test_size=0.3, random_state=9)
    x_dev, x_test, y_dev, y_test = train_test_split(
        x_test_all, y_test_all, test_size=0.3, random_state=9)

    # interaction_only=True: only products of distinct features (x1*x2) are
    # generated, no powers such as x1^2.
    poly_features = PolynomialFeatures(interaction_only=True)
    x_train_poly = poly_features.fit_transform(x_train)
    # Reuse the transformer fitted on the training data instead of refitting.
    # NOTE(review): the dev features are prepared but not evaluated below.
    x_dev_poly = poly_features.transform(x_dev)

    choosen_model = build_model(x_train_poly, y_train, 20)

    predicted_y = choosen_model.predict(x_train_poly)
    model_worth(y_train, predicted_y)

    x_test_poly = poly_features.transform(x_test)
    predicted_y = choosen_model.predict(x_test_poly)
    model_worth(y_test, predicted_y)

# LASSO:
# -*- coding: utf-8 -*-
"""
LASSO (L1 shrinkage) regression on the Boston data.

Created on Mon Apr 09 09:08:51 2018

@author: Alvin AI

Every print call below passes a single pre-formatted string, so the script
produces the same output under Python 2 and Python 3.
"""
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
import numpy as np


def get_data():
    """Load the Boston housing data and return ``(x, y)``."""
    data = load_boston()
    x = data['data']
    y = data['target']
    return x, y


def build_models(x, y):
    """Fit a LASSO model for each of 200 alpha values and plot the paths.

    The alpha values are evenly spaced between 0 and 0.5. The coefficient
    vector obtained for each alpha is collected and handed to ``coeff_path``.
    """
    alpha_range = np.linspace(0, 0.5, 200)
    model = Lasso(normalize=True)
    coeffiecients = []
    # Refit the same estimator once per alpha value.
    for alpha in alpha_range:
        model.set_params(alpha=alpha)
        model.fit(x, y)
        # Store a copy so later refits can never alter an earlier entry.
        coeffiecients.append(model.coef_.copy())
    # One coefficient vector per alpha: plot weights against alpha.
    coeff_path(alpha_range, coeffiecients)


def view_model(model):
    """Print the coefficients and the intercept of a fitted model."""
    print("\n model coeffiecients")
    print("======================")
    for i, coef in enumerate(model.coef_):
        print("\t coefficient %d %0.3f" % (i + 1, coef))
    print("\n\t intercept %0.3f" % (model.intercept_))


def model_worth(true_y, predicted_y):
    """Print and return the mean squared error of the predictions."""
    mse = mean_squared_error(true_y, predicted_y)
    print("\t mean squared error = %0.2f\n" % (mse))
    return mse


def coeff_path(alpha_range, coeffiecients):
    """Plot the coefficient weights obtained for each alpha value."""
    plt.close('all')
    plt.cla()
    plt.figure(1)
    plt.xlabel("Alpha Values")
    plt.ylabel("coeffiecient weights for different alpha values")
    plt.plot(alpha_range, coeffiecients)
    # Tighten the axis limits around the plotted data.
    plt.axis('tight')
    plt.show()


def get_coef(x, y, alpha):
    """Fit LASSO with ``alpha`` and return the indices of non-zero coefficients."""
    model = Lasso(normalize=True, alpha=alpha)
    model.fit(x, y)
    coefs = model.coef_
    indices = [i for i, coef in enumerate(coefs) if abs(coef) > 0.0]
    return indices


if __name__ == "__main__":
    x, y = get_data()

    # Fit across the whole alpha range and draw the coefficient paths.
    build_models(x, y)

    print("\npredicting using all the variables\n")
    full_model = LinearRegression(normalize=True)
    full_model.fit(x, y)
    predicted_y = full_model.predict(x)
    model_worth(y, predicted_y)

    print("\n models at different alpha values\n")
    alpha_values = [0.22, 0.08, 0.01]
    for alpha in alpha_values:
        indices = get_coef(x, y, alpha)
        # How many coefficients survived, and which ones.
        print("\t alpha = %0.2f number of variables selected = %d "
              % (alpha, len(indices)))
        print("%s %s" % ("\t attributes include ", indices))
        # Refit ordinary least squares on the surviving attributes only.
        x_new = x[:, indices]
        model = LinearRegression(normalize=True)
        model.fit(x_new, y)
        predicted_y = model.predict(x_new)
        model_worth(y, predicted_y)
岭回归+交叉验证迭代器:针对数据较少的情况,把训练集划分为K份,模型在其中K-1份数据上进行训练,剩下的一份用作测试,这样就不需要单独划分dev集,这种方法也叫K折交叉验证法。
# -*- coding: utf-8 -*-
"""
Ridge (L2 shrinkage) regression tuned with K-fold cross-validated grid search.

Created on Mon Apr 09 14:30:10 2018

@author: Alvin AI

Every print call below passes a single pre-formatted string, so the script
produces the same output under Python 2 and Python 3.
"""
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import numpy as np


def get_data():
    """Load the Boston housing data and return ``(x, y)``."""
    data = load_boston()
    x = data['data']
    y = data['target']
    return x, y


def build_model(x, y):
    """Grid-search the ridge ``alpha`` with 5-fold CV and return the best model.

    Thirty alpha values evenly spaced between 0.0015 and 0.0017 are tried.
    The mean score of every candidate and the best parameters are printed.
    """
    # 5 consecutive folds; each fold serves once as the validation part.
    kfold = KFold(n_splits=5)
    model = Ridge(normalize=True)
    alpha_range = np.linspace(0.0015, 0.0017, 30)
    grid_param = {"alpha": alpha_range}
    # GridSearchCV fits the model for every parameter value in the grid;
    # cv selects the cross-validation scheme. The scorer is the negated MSE,
    # so display_param_results prints its absolute value.
    grid = GridSearchCV(estimator=model, param_grid=grid_param, cv=kfold,
                        scoring='neg_mean_squared_error')
    grid.fit(x, y)
    results = grid.cv_results_
    # Pair each parameter setting with its mean validation score.
    display_param_results(
        list(zip(results["params"], results["mean_test_score"])))
    print(grid.best_params_)
    return grid.best_estimator_


def view_model(model):
    """Print the coefficients and the intercept of a fitted model."""
    print("\n model coeffiecients")
    print("======================\n")
    for i, coef in enumerate(model.coef_):
        print("\t coefficent %d %0.3f" % (i + 1, coef))
    print("\n\t intercept %0.3f" % (model.intercept_))


def model_worth(true_y, predicted_y):
    """Print and return the mean squared error of the predictions."""
    mse = mean_squared_error(true_y, predicted_y)
    print("\t Mean squared error = %0.2f" % (mse))
    return mse


def display_param_results(param_results):
    """Print the absolute mean score of every grid-search candidate.

    Each entry is indexed positionally: ``entry[0]`` is the parameter
    setting and ``entry[1]`` its mean cross-validation score. The counter is
    printed under the label "fold" although it numbers the candidates.
    """
    for index, entry in enumerate(param_results, 1):
        line = "fold %d mean squared error %0.2f" % (index, abs(entry[1]))
        print("%s %s" % (line, entry[0]))


if __name__ == "__main__":
    x, y = get_data()

    # Split into training and test sets.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.3, random_state=9)

    # Add pairwise interaction features.
    poly_features = PolynomialFeatures(interaction_only=True)
    x_train_poly = poly_features.fit_transform(x_train)
    # Reuse the transformer fitted on the training data instead of refitting.
    x_test_poly = poly_features.transform(x_test)

    choosen_model = build_model(x_train_poly, y_train)

    predicted_y = choosen_model.predict(x_train_poly)
    model_worth(y_train, predicted_y)
    view_model(choosen_model)

    predicted_y = choosen_model.predict(x_test_poly)
    model_worth(y_test, predicted_y)
热门工具 换一换