curve可以帮助我们判断模型现在所处的状态：过拟合（overfitting / high variance）或欠拟合（underfitting / high bias）

import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot the learning curve of a model on the given data.

    Parameters
    ----------
    estimator : the classifier (model) to evaluate.
    title : title shown at the top of the figure.
    X : feature data, forwarded to ``learning_curve``.
    y : target data, forwarded to ``learning_curve``.
    ylim : optional ``(ymin, ymax)`` pair unpacked into ``plt.ylim``;
        when ``None`` the y-axis limits are left to matplotlib.
    cv : cross-validation strategy, forwarded to ``learning_curve``.
    n_jobs : number of parallel jobs, forwarded to ``learning_curve``.
    train_sizes : training-set sizes, forwarded to ``learning_curve``.

    Returns
    -------
    (midpoint, diff) : computed at the last (largest) training size from
        ``upper = train mean + train std`` and ``lower = CV mean - CV std``;
        ``midpoint`` is their average and ``diff`` is ``upper - lower``.
    """
    plt.figure()
    plt.title(title)
    # The default ylim=None cannot be unpacked, so only set limits on request.
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One row per training size; aggregate across the CV splits (axis=1).
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.grid()

    # Shaded bands show +/- one standard deviation around each mean curve.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1,
                     color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")

    plt.draw()
    plt.show()

    # Summarise the gap between the two curves at the largest training size.
    train_upper = train_scores_mean[-1] + train_scores_std[-1]
    test_lower = test_scores_mean[-1] - test_scores_std[-1]
    midpoint = (train_upper + test_lower) / 2
    diff = train_upper - test_lower
    return midpoint, diff


digits = load_digits()
X, y = digits.data, digits.target

title = "Learning Curves (Naive Bayes)"
# Cross validation with 100 iterations to get smoother mean test and train
# score curves, each time with 20% data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
estimator = GaussianNB()
plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=4)

# Raw string: "\g" is not a valid escape sequence in a normal string literal.
title = r"Learning Curves (SVM, RBF kernel, $\gamma=0.001$)"
# SVC is more expensive so we do a lower number of CV iterations:
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
estimator = SVC(gamma=0.001)
plot_learning_curve(estimator, title, X, y, (0.7, 1.01), cv=cv, n_jobs=4)

<http://blog.csdn.net/aliceyangxi1987/article/details/73598857>

<http://blog.csdn.net/xlinsist/article/details/51344449>

<http://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html>