1、股价数据
2、对该股票（公司）的情感数据

<http://www.nltk.org/>（Natural Language Toolkit）来进行处理。

<https://www.jianshu.com/p/2fcd1884bcfa>

import unicodedata
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from treeinterpreter import treeinterpreter as ti

# Reduce the raw frame to an integer price column and the article text.
df_stocks['prices'] = df_stocks['adj close'].astype(np.int64)
# .copy() makes the two-column frame independent, so the assignment below
# does not write into a slice of the previous frame.
df_stocks = df_stocks[['prices', 'articles']].copy()
# Strip leading '.' and '-' characters from every article string; cells that
# are not strings become NaN instead of raising AttributeError.
df_stocks['articles'] = df_stocks['articles'].str.lstrip('.-')

print(df_stocks)

prices articles 2007-01-01 12469 What Sticks from '06. Somalia Orders
Islamist... 2007-01-02 12472 Heart Health: Vitamin Does Not Prevent Death ...
12480 Helping Make the Shift From Combat to Commerc... 2007-01-05 12398 Rise in
Ethanol Raises Concerns About Cornas... 2007-01-06 12406 A Status Quo Secretary
General. Best Buyand ... 2007-01-07 12414 THE COMMON APPLICATION; Typo.com.
Jumbo Bonus... ... ... ...2016-12-31 19762 Terrorist Attack at Nightclub in
Istanbul Kill... [3653 rows x 2 columns] Process finished with exit code 0

将prices Series独立出来，成为一个单独的DataFrame对象，因为我们要对股票数据进行分析，并且不想破坏原DataFrame。在独立出prices之后，我们再添加几个新的Series，接下来就是使用NLTK对文章进行情感分析了。
# Work on an independent copy that holds only the price column, then add one
# placeholder column (empty string) per sentiment score.
df = df_stocks[['prices']].copy()
for score_column in ("compound", "neg", "neu", "pos"):
    # compound = combined score, neg = negative, neu = neutral, pos = positive
    df[score_column] = ''

neg Series用来存放该新闻的负面指数，neu Series用来存放该新闻的中立指数，pos Series用来存放该新闻的正面（积极）指数，compound Series用来存放该新闻的合成（将neg、neu、pos结合）指数。
# Score each day's article text and store the four values returned by
# `sid.polarity_scores` in the matching row of `df`.
# NOTE(review): `sid` is not defined in the visible part of this file; it is
# presumably an NLTK VADER SentimentIntensityAnalyzer instance -- confirm.
for date, article in df_stocks['articles'].items():
    try:
        # unicodedata.normalize raises TypeError when the cell is not a string
        sentence = unicodedata.normalize('NFKD', article)
        ss = sid.polarity_scores(sentence)
    except TypeError:
        # Report the offending cell and leave this row's scores untouched.
        print(article)
        print(date)
    else:
        for key in ('compound', 'neg', 'neu', 'pos'):
            df.at[date, key] = ss[key]

            prices  compound    neg    neu    pos
2007-01-01   12469   -0.9814  0.159  0.749  0.093
2007-01-02   12472   -0.8179  0.114  0.787  0.099
2007-01-03   12474   -0.9993  0.198  0.737  0.065
...            ...       ...    ...    ...    ...
2016-12-28   19833    0.2869  0.128  0.763  0.108
2016-12-29   19819   -0.9789  0.138  0.764  0.097
2016-12-30   19762   -0.995   0.168  0.734  0.098
2016-12-31   19762   -0.2869  0.173  0.665  0.161

[3653 rows x 5 columns]

Process finished with exit code 0

# Date boundaries of the training and test periods.
train_start_date = '2007-01-01'
train_end_date = '2014-12-31'
test_start_date = '2015-01-01'
test_end_date = '2016-12-31'

# DataFrame.ix has been removed from pandas; .loc performs the same
# label-based slice.
train = df.loc[train_start_date:train_end_date]
test = df.loc[test_start_date:test_end_date]

# Feature matrices: one row per day, columns are the 'neg' and 'pos' scores.
# The original second loop iterated `train` again and overwrote
# `numpy_df_train`, so `numpy_df_test` was never created; it is built from
# `test` here.
numpy_df_train = np.asarray(train[['neg', 'pos']], dtype=float)
numpy_df_test = np.asarray(test[['neg', 'pos']], dtype=float)

# Targets are kept as single-column DataFrames; y_test is plotted further down.
y_train = pd.DataFrame(train['prices'])
y_test = pd.DataFrame(test['prices'])

# Fit a random forest on the training features.
rf = RandomForestRegressor()
# Pass a flat 1-D target instead of an (n, 1) column vector.
rf.fit(numpy_df_train, y_train.values.ravel())

# treeinterpreter splits each prediction into a bias term plus per-feature
# contributions; only `prediction` is used below.
prediction, bias, contributions = ti.predict(rf, numpy_df_test)
print(prediction)

# Plot the raw random-forest predictions against the actual test prices.
idx = pd.date_range(test_start_date, test_end_date)
# One prediction per calendar day of the test period (the original hard-coded
# 731, which is the number of days in 2015-2016).
predictions_df = pd.DataFrame(data=prediction[:len(idx)], index=idx, columns=['prices'])
print(predictions_df)

# The column is called 'prices'; renaming "Price" (as the original did) was a
# no-op, so the legend labels never changed.
ax = predictions_df.rename(columns={"prices": "Predicted Price"}).plot(
    title='Random Forest Predict Stock Price')
ax.set_xlabel("Date")
ax.set_ylabel("Price")
fig = y_test.rename(columns={"prices": "Actual Price"}).plot(ax=ax).get_figure()
fig.savefig("RF_noSmoothing.png")

def _mean_price_from(frame, start_date, days):
    """Return the mean of ``frame['prices']`` over ``days`` consecutive days.

    Rows are looked up by '%Y-%m-%d' date strings, starting at ``start_date``
    and advancing one calendar day per step.
    """
    day = datetime.strptime(start_date, "%Y-%m-%d").date()
    total = 0
    for _ in range(days):
        total += frame.loc[day.strftime('%Y-%m-%d'), 'prices']
        day += timedelta(days=1)
    return total / days


# NOTE: the variable names say "5 days", but the window is `total_days` days.
total_days = 10

# Mean actual price at the start of the test period.
average_last_5_days_test = _mean_price_from(test, test_start_date, total_days)
print(average_last_5_days_test)

# Mean predicted price over the same days.
average_upcoming_5_days_predicted = _mean_price_from(
    predictions_df, test_start_date, total_days)
print(average_upcoming_5_days_predicted)

# Shift every prediction by the gap between the two means.
difference_test_predicted_prices = (
    average_last_5_days_test - average_upcoming_5_days_predicted)
print(difference_test_predicted_prices)
predictions_df['prices'] = predictions_df['prices'] + difference_test_predicted_prices

# Aligned plot: predictions and actual prices drawn on the same axes and
# written to RF_aligned.png.
aligned_predictions = predictions_df.rename(columns={"prices": "predicted_price"})
actual_prices = y_test.rename(columns={"prices": "actual_price"})

ax = aligned_predictions.plot(title='Random Forest Predict Stock Price Aligned')
ax.set_xlabel("Dates")
ax.set_ylabel("Stock Prices")

fig = actual_prices.plot(ax=ax).get_figure()
fig.savefig("RF_aligned.png")

EWMA（Exponentially Weighted Moving-Average，指数加权移动平均值的控制图）方法来进行。
# Smooth the predicted and the actual prices with an exponentially weighted
# mean (span=60). The older pd.ewma(...) helper no longer exists in pandas;
# Series.ewm(...).mean() is its replacement, and it takes no `freq` argument.
predictions_df['ewm'] = predictions_df["prices"].ewm(span=60).mean()
predictions_df['actual_value'] = test['prices']
predictions_df['actual_value_ewm'] = predictions_df["actual_value"].ewm(span=60).mean()

# Rename the four columns (in the order they were created) for plotting.
predictions_df.columns = ['predicted_price', 'average_predicted_price',
                          'actual_price', 'average_actual_price']

# Smoothed plot: draw every column of predictions_df and save the figure to
# RF_smoothed.png.
smoothed_title = 'Random Forest Predict Stock Price Aligned and Smoothed'
predictions_plot = predictions_df.plot(title=smoothed_title)
predictions_plot.set_xlabel("Dates")
predictions_plot.set_ylabel("Stock Prices")

fig = predictions_plot.get_figure()
fig.savefig("RF_smoothed.png")

# Plot only the two smoothed curves (actual trend versus predicted trend).
# The column names are lowercase; the capitalised names used originally
# ('Average_predicted_price', 'Average_actual_price') raise KeyError.
predictions_df_average = predictions_df[['average_predicted_price', 'average_actual_price']]
predictions_plot = predictions_df_average.plot(
    title='Random Forest Predict Stock Price Aligned and Smoothed')
predictions_plot.set_xlabel("Dates")
predictions_plot.set_ylabel("Prices")
fig = predictions_plot.get_figure()
fig.savefig("RF_smoothed_and_actual_price.png")

def LR_prediction():
    """Fit a LogisticRegression per year (2007-2016) on sentiment scores and plot it.

    For every year, rows from 01-01 to 10-31 form the training set and rows
    from 11-01 to 12-31 form the test set. The model is fitted on the
    'compound', 'neg', 'neu' and 'pos' columns with 'prices' as the target.
    Its predictions are shifted by the value returned from ``offset_value``,
    smoothed with ``ewm(span=10)``, and plotted next to the actual prices.

    Reads the module-level ``df`` and ``plt`` and calls ``offset_value``,
    which is not defined in the visible part of this file. Returns None.
    """
    feature_columns = ['compound', 'neg', 'neu', 'pos']

    for year in range(2007, 2017):
        # Split this year's rows into a training and a test period.
        train_start_date = str(year) + '-01-01'
        train_end_date = str(year) + '-10-31'
        test_start_date = str(year) + '-11-01'
        test_end_date = str(year) + '-12-31'
        # DataFrame.ix has been removed from pandas; .loc does the same slice.
        train = df.loc[train_start_date:train_end_date]
        test = df.loc[test_start_date:test_end_date]

        # Sentiment scores as feature matrices, one row per day.
        numpy_df_train = np.asarray(train[feature_columns], dtype=float)
        numpy_df_test = np.asarray(test[feature_columns], dtype=float)

        # NOTE(review): the original comment and the plot title call this
        # "linear regression", but LogisticRegression is a classifier --
        # confirm which model is intended.
        lr = LogisticRegression()
        lr.fit(numpy_df_train, train['prices'])
        prediction = lr.predict(numpy_df_test)

        idx = pd.date_range(test_start_date, test_end_date)
        predictions_df_list = pd.DataFrame(data=prediction, index=idx, columns=['prices'])

        # Align: shift the predictions by the offset computed by offset_value.
        difference_test_predicted_prices = offset_value(
            test_start_date, test, predictions_df_list)
        predictions_df_list['prices'] = (
            predictions_df_list['prices'] + difference_test_predicted_prices)

        # Smooth: ewm() no longer accepts the `freq` argument used originally.
        predictions_df_list['ewm'] = predictions_df_list["prices"].ewm(span=10).mean()
        predictions_df_list['actual_value'] = test['prices']
        predictions_df_list['actual_value_ewma'] = (
            predictions_df_list["actual_value"].ewm(span=10).mean())

        # Rename the four columns in creation order.
        predictions_df_list.columns = ['predicted_price', 'average_predicted_price',
                                       'actual_price', 'average_actual_price']
        predictions_df_list.plot()

        # Plot only the smoothed actual and predicted curves.
        predictions_df_list_average = predictions_df_list[
            ['average_predicted_price', 'average_actual_price']]
        predictions_plot = predictions_df_list_average.plot(
            title='Linear Regression Predict Stock Price Aligned and Smoothed')
        predictions_plot.set_xlabel("Dates")
        predictions_plot.set_ylabel("Prices")
        fig = predictions_plot.get_figure()
        # NOTE(review): the same file name is written for every year, so each
        # year overwrites the previous image -- confirm this is intended.
        fig.savefig("LR_smoothed_and_actual_price.png")
        plt.show()

def MLP_prediction():
    """Fit an MLPClassifier per year (2007-2016) on sentiment scores and plot it.

    For every year, rows from 01-01 to 10-31 form the training set and rows
    from 11-01 to 12-31 form the test set. The model is fitted on the
    'compound', 'neg', 'neu' and 'pos' columns with 'prices' as the target.
    Its predictions are shifted by the value returned from ``offset_value``,
    smoothed with ``ewm(span=20)``, and plotted next to the actual prices.

    Reads the module-level ``df`` and ``plt`` and calls ``offset_value``,
    which is not defined in the visible part of this file. Returns None.
    """
    feature_columns = ['compound', 'neg', 'neu', 'pos']

    for year in range(2007, 2017):
        # Split this year's rows into a training and a test period.
        train_start_date = str(year) + '-01-01'
        train_end_date = str(year) + '-10-31'
        test_start_date = str(year) + '-11-01'
        test_end_date = str(year) + '-12-31'
        # DataFrame.ix has been removed from pandas; .loc does the same slice.
        train = df.loc[train_start_date:train_end_date]
        test = df.loc[test_start_date:test_end_date]

        # Sentiment scores as feature matrices, one row per day.
        numpy_df_train = np.asarray(train[feature_columns], dtype=float)
        numpy_df_test = np.asarray(test[feature_columns], dtype=float)

        # Build and fit the MLP model.
        mlpc = MLPClassifier(hidden_layer_sizes=(100, 200, 100), activation='relu',
                             solver='lbfgs', alpha=0.005, learning_rate_init=0.001,
                             shuffle=False)
        mlpc.fit(numpy_df_train, train['prices'])
        prediction = mlpc.predict(numpy_df_test)

        idx = pd.date_range(test_start_date, test_end_date)
        predictions_df_list = pd.DataFrame(data=prediction, index=idx, columns=['prices'])

        # Align: shift the predictions by the offset computed by offset_value.
        difference_test_predicted_prices = offset_value(
            test_start_date, test, predictions_df_list)
        predictions_df_list['prices'] = (
            predictions_df_list['prices'] + difference_test_predicted_prices)

        # Smooth (original note: "span = 20", "best 1"); ewm() no longer
        # accepts the `freq` argument used originally.
        predictions_df_list['ewma'] = predictions_df_list["prices"].ewm(span=20).mean()
        predictions_df_list['actual_value'] = test['prices']
        predictions_df_list['actual_value_ewma'] = (
            predictions_df_list["actual_value"].ewm(span=20).mean())

        # Rename the four columns in creation order.
        predictions_df_list.columns = ['predicted_price', 'average_predicted_price',
                                       'actual_price', 'average_actual_price']
        predictions_df_list.plot()

        # Plot only the smoothed actual and predicted curves.
        predictions_df_list_average = predictions_df_list[
            ['average_predicted_price', 'average_actual_price']]
        predictions_df_list_average.plot()
        plt.show()

Science方面的强大能力。在生活中，我们可以通过选择合适的算法，编写如微博情感分析、聊天机器人、图像识别、语音识别、天气预测等遍及生活的人工智能应用。