本文將從四個案例 房價預測、泰坦尼克號生還預測、股票預測、影評情感預測 入手,讓童鞋們從實戰角度快速入門深度學習的預測部分!
房價預測
基于決策樹回歸器(DecisionTreeRegressor)
數據文件在這:
鏈接:https://pan.baidu.com/s/1mPr60cFUSc5m7pmF8Ju4vw 提取碼:j2b0
#基于DecisionTreeRegressor預測北京房價
import numpy
import pandas as pd
import matplotlib
import seaborn
from sklearn.model_selection import GridSearchCV, ShuffleSplit, train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer
import tensorflow
import numpy as np
#定義一堆函數(shù)
# 定義網(wǎng)格搜索最佳模型函數(shù)
def gridSearchVC_fit_model(X, y):
    """Grid-search ``max_depth`` for a decision-tree regressor.

    Args:
        X: Feature matrix handed to ``GridSearchCV.fit``.
        y: Target values handed to ``GridSearchCV.fit``.

    Returns:
        ``grid.best_estimator_`` -- the regressor refit with the best depth.
    """
    # Local import so this block does not depend on an extra file-level import.
    from sklearn.metrics import r2_score

    # 10 reshuffled splits, 20% held out each time; the fixed seed makes the
    # splits repeatable between calls.
    cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    regressor = DecisionTreeRegressor(random_state=0)
    # Candidate depths 1..10 inclusive (range's upper bound is exclusive).
    params = {"max_depth": list(range(1, 11))}
    # The original passed an undefined `performance_metric` here.
    # NOTE(review): R^2 is assumed to be the intended metric -- confirm.
    scoring_fnc = make_scorer(score_func=r2_score)
    grid = GridSearchCV(estimator=regressor, param_grid=params,
                        scoring=scoring_fnc, cv=cv)
    grid = grid.fit(X, y)
    return grid.best_estimator_
# 預測房屋價格
def PredictHousingPrice(X, y, fitter):
    """Fit a model on an 80/20 split and predict the held-out targets.

    Args:
        X: Feature matrix.
        y: Target values.
        fitter: Callable ``fitter(X_train, y_train)`` returning an object
            with a ``predict`` method.

    Returns:
        Tuple ``(y_test, predictions)`` where ``predictions`` come from the
        model fitted in the last of the 10 iterations.
    """
    epochs = 10
    y_predict_test_price = None
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    for epoch_i in range(epochs):
        # Refit from scratch each round; only the last prediction is kept.
        reg = fitter(X_train, y_train)
        y_predict_test_price = reg.predict(X_test)
        print("迭代第{}次。".format(epoch_i+1))
    return y_test, y_predict_test_price
# 顯示真實的房價和預測房價對比圖
def plotVersusFigure(y_true_price, y_predict_price):
    """Overlay true and predicted prices as two line plots in one figure.

    Args:
        y_true_price: Sequence of true prices.
        y_predict_price: Sequence of predicted prices.

    Each series is drawn against its own x positions: evenly spaced values
    from 1 to that series' maximum, rounded to integers.
    """
    # Imported here because this script only does `import matplotlib`,
    # which does not bind the name `plt`.
    import matplotlib.pyplot as plt

    plt.figure(figsize=(10, 7))
    X_show = np.rint(
        np.linspace(1, np.max(y_true_price), len(y_true_price))
    ).astype(int)
    # Circle markers joined by a solid line, cyan.
    plt.plot(X_show, y_true_price, 'o-', color='c')
    X_show_predicted = np.rint(
        np.linspace(1, np.max(y_predict_price), len(y_predict_price))
    ).astype(int)
    # Same marker style, magenta, drawn on the same axes.
    plt.plot(X_show_predicted, y_predict_price, 'o-', color='m')
    plt.title('Housing Prices Prediction')
    plt.legend(loc='lower right', labels=["True Prices", "Predicted Prices"])
    plt.xlabel("House's Price Tendency By Array")
    plt.ylabel("House's Price")
    plt.show()
#開搞!
# 根據(jù)北京的房價數(shù)據(jù)來預測
# 加載數(shù)據(jù)集
# Load the Beijing housing data set.
df = pd.read_csv('bj_housing.csv')
df.describe()
# 'Value' is the target column; every other column is used as a feature.
bj_prices = df['Value']
bj_prices.head()
bj_features = df.drop('Value', axis=1)
bj_features.head()
# Parenthesised so the call can span lines (the original split the
# assignment across two lines without a continuation).
y_true_bj_price, y_predict_bj_price = PredictHousingPrice(
    bj_features, bj_prices, gridSearchVC_fit_model)
y_true_bj_price.reset_index().drop('index', axis=1).head()
pd.Series(y_predict_bj_price).head()
# Compare true and predicted prices.
plotVersusFigure(y_true_bj_price, y_predict_bj_price)
基于Keras
# 使用Keras來預測波士頓的房價預測
import tensorflow as tf
from tensorflow import keras
import numpy as np
# 加載波士頓的房價數(shù)據(jù)
# Parenthesised right-hand side: the original broke this assignment across
# two lines without a continuation.
(train_data, train_labels), (test_data, test_labels) = (
    keras.datasets.boston_housing.load_data())
# Shuffle the training set: argsort of random floats gives a random
# permutation of the row indices.
order = np.argsort(np.random.random(train_labels.shape))
train_data = train_data[order]
train_labels = train_labels[order]
# Standardise each feature column (subtract mean, divide by std). The test
# set is scaled with the training statistics.
mean = train_data.mean(axis=0)
std = train_data.std(axis=0)
train_data = (train_data - mean) / std
test_data = (test_data - mean) / std
print("train_data.shape: {}, train_labels.shape: {}."
      .format(train_data.shape, train_labels.shape))
print("test_data.shape: {}, test_labels.shape: {}."
      .format(test_data.shape, test_labels.shape))
# 創(chuàng)建模型函數(shù)
def build_model():
    """Build and compile a small dense regression network.

    Returns:
        A compiled ``keras.Sequential`` model: two 64-unit ReLU layers and a
        single linear output, MSE loss, RMSProp optimizer, MAE metric.
        The input width is read from the module-level ``train_data``.
    """
    model = keras.Sequential([
        keras.layers.Dense(64, activation=tf.nn.relu,
                           input_shape=(train_data.shape[1],)),
        keras.layers.Dense(64, activation=tf.nn.relu),
        keras.layers.Dense(1)
    ])
    optimizer = tf.train.RMSPropOptimizer(0.001)
    model.compile(loss='mse',
                  optimizer=optimizer,
                  metrics=['mae'])
    return model


model = build_model()
# Print the layer-by-layer architecture.
model.summary()
# 自定義一個回調(diào)類,在每次epoch(代)結束時都會調(diào)用該函數(shù)
class PrintDot(keras.callbacks.Callback):
    """Progress callback: prints one dot per epoch, a newline every 100."""

    def on_epoch_end(self, epoch, logs=None):
        # `logs` is unused; the default keeps the hook callable without it.
        if epoch % 100 == 0:
            print('')
        print('.', end='')
# Number of training epochs.
EPOCHS = 500
# Train silently (verbose=0) with 20% of the training data held out for
# validation; PrintDot supplies the progress output.
history = model.fit(train_data, train_labels, epochs=EPOCHS,
validation_split=0.2, verbose=0,
callbacks=[PrintDot()])
import matplotlib.pyplot as plt
# 繪制圖來顯示訓練的誤差歷史
def plot_history(history):
    """Plot training and validation mean absolute error per epoch.

    Args:
        history: Object exposing ``epoch`` (x values) and ``history`` (a
            dict of per-epoch metric series), as returned by ``model.fit``.

    Raises:
        KeyError: If neither the long nor the short MAE key is present.
    """
    log = history.history

    def _series(*names):
        # Return the first recorded series among the candidate key names.
        for name in names:
            if name in log:
                return np.array(log[name])
        raise KeyError(names[0])

    plt.figure()
    plt.xlabel('Epoch')
    plt.ylabel('Mean Abs Error [1000$]')
    # Accept both the long key used originally and the short 'mae' form.
    plt.plot(history.epoch, _series('mean_absolute_error', 'mae'),
             label='Train Loss')
    plt.plot(history.epoch, _series('val_mean_absolute_error', 'val_mae'),
             label='Val loss')
    plt.legend()
    plt.ylim([0, 5])
    plt.show()


plot_history(history)
# 評估模型
# Evaluate on the test set; evaluate() returns [loss, metric].
[loss, mae] = model.evaluate(test_data, test_labels, verbose=0)
print("Testing set Mean Abs Error: ${:7.2f}".format(mae * 1000))
# Predict the test set and flatten to a 1-D array.
test_predictions = model.predict(test_data).flatten()
# Scatter of predictions against true values.
plt.scatter(test_labels, test_predictions)
plt.xlabel('True Values [1000$]')
plt.ylabel('Predictions [1000$]')
plt.axis('equal')
# Re-apply the current limits so the diagonal below does not rescale the axes.
plt.xlim(plt.xlim())
plt.ylim(plt.ylim())
# Reference diagonal y = x.
plt.plot([-100, 100], [-100, 100])
plt.show()
# Histogram of prediction errors (prediction minus true value).
error = test_predictions - test_labels
plt.hist(error, bins=50)
plt.xlabel("Prediction Error [1000$]")
plt.ylabel("Count")
plt.show()
# 顯示真實的房價和預測房價對比圖
def plotVersusFigure(y_true_price, y_predict_price):
    """Overlay true and predicted prices as two line plots in one figure.

    Args:
        y_true_price: Sequence of true prices.
        y_predict_price: Sequence of predicted prices.

    Each series is drawn against its own x positions: evenly spaced values
    from 1 to that series' maximum, rounded to integers.
    """
    # Function-scope import keeps the helper usable on its own.
    import matplotlib.pyplot as plt

    plt.figure(figsize=(10, 7))
    X_show = np.rint(
        np.linspace(1, np.max(y_true_price), len(y_true_price))
    ).astype(int)
    # Circle markers joined by a solid line, cyan.
    plt.plot(X_show, y_true_price, 'o-', color='c')
    X_show_predicted = np.rint(
        np.linspace(1, np.max(y_predict_price), len(y_predict_price))
    ).astype(int)
    # Same marker style, magenta, drawn on the same axes.
    plt.plot(X_show_predicted, y_predict_price, 'o-', color='m')
    plt.title('Housing Prices Prediction')
    plt.legend(loc='lower right', labels=["True Prices", "Predicted Prices"])
    plt.xlabel("House's Price Tendency By Array")
    plt.ylabel("House's Price")
    plt.show()


# Compare true and predicted values for the test set.
plotVersusFigure(test_labels, test_predictions)
泰坦尼克號生還預測
提供1309行泰坦尼克號乘客數據,其中891行是訓練數據,418行是測試數據,一共有12列,其中有一列表示乘客是否生還。
下面用sklearn(決策樹、邏輯回歸、梯度提升、多層感知機)和keras(DNN)實現乘客生還預測。
數據文件在這:
鏈接:https://pan.baidu.com/s/1o_FUa_4VxmqXVBMBGh4rog 提取碼:apzg
基于Sklearn
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# 加載數(shù)據(jù)
features = pd.read_csv('titanic_dataset.csv')
y_train = features['Survived']
X_train = features.drop('Survived', axis=1)
# Preview the first rows.
X_train.head()
print("X_train.shape={}, y_train.shape={}".format(X_train.shape, y_train.shape))
X_train.info()
# Age distribution before imputation.
sns.distplot(X_train['Age'].dropna(), hist=True, kde=True)
# Fill missing ages with the median. Assigning the column back (rather
# than a chained inplace replace on a column selection) guarantees the
# DataFrame itself is updated.
X_train['Age'] = X_train['Age'].fillna(X_train['Age'].median())
sns.distplot(X_train['Age'], hist=True, kde=True)
# Cabin is dropped rather than imputed.
X_train.drop("Cabin", axis=1, inplace=True)
# Embarkation ports: S = Southampton, C = Cherbourg, Q = Queenstown.
X_train.Embarked.value_counts()
sns.countplot(x='Embarked', data=X_train)
# Missing embarkation ports are filled with 'S'.
X_train['Embarked'] = X_train['Embarked'].fillna('S')
# Show the row(s) whose Fare is missing.
X_train[X_train["Fare"].isna()]
# Fares of third-class passengers who boarded at Southampton.
pclass3_fares = X_train.query('Pclass == 3 & Embarked == "S"')['Fare']
# Median over the known fares only; Series.median skips NaN, so the
# missing fare no longer drags the median down as a 0.
median_fare = pclass3_fares.median()
# Write the median into the passenger whose fare is missing.
X_train.loc[X_train['PassengerId'] == 1044, 'Fare'] = median_fare
# Encode sex numerically: male -> 1, female -> 0.
X_train['Sex'] = X_train['Sex'].map({'male': 1, 'female': 0})
X_train.isnull().sum()
print("X_train.shape={}, y_train.shape={}".format(X_train.shape, y_train.shape))
# One-hot encode the remaining non-numeric columns.
X_train = pd.get_dummies(X_train)
X_train.head()
print("X_train.shape={}, y_train.shape={}".format(X_train.shape, y_train.shape))
from sklearn.model_selection import train_test_split
# 80/20 shuffled split with a fixed seed.
train_X, test_X, train_y, test_y = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, shuffle=True)
print("train_X.shape={}, train_y.shape={}".format(train_X.shape, train_y.shape))
print("test_X.shape={}, test_y.shape={}".format(test_X.shape, test_y.shape))
# 使用決策樹預測模型
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
# 創(chuàng)建決策樹模型
def createDecisionTreeClassifier():
    """Train a decision tree on the module-level split and report metrics.

    Reads ``train_X``, ``train_y``, ``test_X`` and ``test_y`` from module
    scope, prints train/test accuracy and test AUC.

    Returns:
        Tuple ``(fpr, tpr)`` of the test-set ROC curve.
    """
    model = DecisionTreeClassifier()
    model.fit(train_X, train_y)
    train_pred = model.predict(train_X)
    test_pred = model.predict(test_X)
    train_accuracy = accuracy_score(train_y, train_pred)
    test_accuracy = accuracy_score(test_y, test_pred)
    print('The training accuracy is {}.'.format(train_accuracy))
    print('The test accuracy is {}'.format(test_accuracy))
    # Column 1 of predict_proba is used as the positive-class score.
    y_score_dt = model.predict_proba(test_X)
    fpr_dt, tpr_dt, thresholds_dt = metrics.roc_curve(test_y, y_score_dt[:, 1])
    print('Decision Tree Classifier AUC is: {:.3f}'.format(metrics.roc_auc_score(test_y, y_score_dt[:, 1])))
    return fpr_dt, tpr_dt


fpr_dt, tpr_dt = createDecisionTreeClassifier()
# 創(chuàng)建邏輯回歸預測模型
from sklearn.linear_model import LogisticRegression
def createLogisticRegressionModel():
    """Train logistic regression on the module-level split and report metrics.

    Reads ``train_X``, ``train_y``, ``test_X`` and ``test_y`` from module
    scope, prints train/test accuracy and test AUC.

    Returns:
        Tuple ``(fpr, tpr)`` of the test-set ROC curve.
    """
    model = LogisticRegression()
    model.fit(train_X, train_y)
    print('Logistic Regression Accuracy for training data is: {:.3f}'.format(model.score(train_X, train_y)))
    print('Logistic Regression Accuracy for testing data is: {:.3f}'.format(model.score(test_X, test_y)))
    # The decision-function value is used as the ranking score.
    y_score_lr = model.decision_function(test_X)
    print('Logistic Regression AUC is: {:.3f}'.format(metrics.roc_auc_score(test_y, y_score_lr)))
    fpr_lr, tpr_lr, thresholds_lr = metrics.roc_curve(test_y, y_score_lr)
    return fpr_lr, tpr_lr


fpr_lr, tpr_lr = createLogisticRegressionModel()
# 創(chuàng)建梯度提升模型
from sklearn.ensemble import GradientBoostingClassifier
def createGradientBoostingClassifierModel():
    """Train gradient boosting on the module-level split and report metrics.

    Reads ``train_X``, ``train_y``, ``test_X`` and ``test_y`` from module
    scope, prints train/test accuracy and test AUC.

    Returns:
        Tuple ``(fpr, tpr)`` of the test-set ROC curve.
    """
    model = GradientBoostingClassifier(n_estimators=500)
    model.fit(train_X, train_y)
    train_pred = model.predict(train_X)
    test_pred = model.predict(test_X)
    print('Gradient Boosting Accuracy for training data is: {:.3f}'.format(accuracy_score(train_y, train_pred)))
    print('Gradient Boosting Accuracy for testing data is: {:.3f}'.format(accuracy_score(test_y, test_pred)))
    # Column 1 of predict_proba is used as the positive-class score.
    y_score_gb = model.predict_proba(test_X)
    fpr_gb, tpr_gb, thresholds_gb = metrics.roc_curve(test_y, y_score_gb[:, 1])
    print('Gradient Boosting Classifier AUC is: {:.3f}'.format(metrics.roc_auc_score(test_y, y_score_gb[:, 1])))
    return fpr_gb, tpr_gb


fpr_gb, tpr_gb = createGradientBoostingClassifierModel()
# 創(chuàng)建多層感知器的預測模型
from sklearn.neural_network import MLPClassifier
def createMLPClassifierModel():
    """Train an MLP classifier on the module-level split and report metrics.

    Reads ``train_X``, ``train_y``, ``test_X`` and ``test_y`` from module
    scope, prints train/test accuracy and test AUC.

    Returns:
        Tuple ``(fpr, tpr)`` of the test-set ROC curve.
    """
    model = MLPClassifier(hidden_layer_sizes=128, batch_size=64, max_iter=1000, solver="adam")
    model.fit(train_X, train_y)
    train_pred = model.predict(train_X)
    test_pred = model.predict(test_X)
    print('Neural Network classifier Accuracy for training data is: {:.3f}'.format(accuracy_score(train_y, train_pred)))
    print('Neural Network classifier Accuracy for testing data is: {:.3f}'.format(accuracy_score(test_y, test_pred)))
    # Column 1 of predict_proba is used as the positive-class score.
    y_score_nn = model.predict_proba(test_X)
    fpr_nn, tpr_nn, thresholds_nn = metrics.roc_curve(test_y, y_score_nn[:, 1])
    print('Neural Network Classifier AUC is: {:.3f}'.format(metrics.roc_auc_score(test_y, y_score_nn[:, 1])))
    return fpr_nn, tpr_nn


fpr_nn, tpr_nn = createMLPClassifierModel()
# 全部模型的訓練曲線畫圖!
# Draw the ROC curves of all four models on one set of axes.
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111)
ax1 = ax.plot(fpr_dt, tpr_dt, c='c', lw=2, label="Decision Tree")
ax2 = ax.plot(fpr_lr, tpr_lr, c='y', lw=2, label="Logistic Regression")
ax3 = ax.plot(fpr_gb, tpr_gb, c='r', lw=2, label="Gradient Boosting")
ax4 = ax.plot(fpr_nn, tpr_nn, c='b', lw=2, label="Neural Network")
ax.grid()
# Each plot() call returns a list of lines; concatenate them.
lns = ax1 + ax2 + ax3 + ax4
# Pass the lines as handles: a lone positional list is read as labels,
# not handles. The labels come from each line's own `label`.
ax.legend(handles=lns, loc=0)
plt.show()
train_X.shape
基于Keras
# Keras的神經(jīng)網(wǎng)絡模型來預測
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras import utils as np_utils
# 加載數(shù)據(jù)
features = pd.read_csv('titanic_dataset.csv')
y_train = features['Survived']
X_train = features.drop('Survived', axis=1)
# Fill missing ages with the median. Assigning the column back (rather
# than a chained inplace replace on a column selection) guarantees the
# DataFrame itself is updated.
X_train['Age'] = X_train['Age'].fillna(X_train['Age'].median())
sns.distplot(X_train['Age'], hist=True, kde=True)
# Cabin is dropped rather than imputed.
X_train.drop("Cabin", axis=1, inplace=True)
X_train.Embarked.value_counts()
sns.countplot(x='Embarked', data=X_train)
# Missing embarkation ports are filled with 'S'.
X_train['Embarked'] = X_train['Embarked'].fillna('S')
# Show the row(s) whose Fare is missing.
X_train[X_train["Fare"].isna()]
# Fares of third-class passengers who boarded at Southampton.
pclass3_fares = X_train.query('Pclass == 3 & Embarked == "S"')['Fare']
# Median over the known fares only; Series.median skips NaN, so the
# missing fare no longer drags the median down as a 0.
median_fare = pclass3_fares.median()
# Write the median into the passenger whose fare is missing.
X_train.loc[X_train['PassengerId'] == 1044, 'Fare'] = median_fare
# Encode sex numerically: male -> 1, female -> 0.
X_train['Sex'] = X_train['Sex'].map({'male': 1, 'female': 0})
X_train.isnull().sum()
print("X_train.shape={}, y_train.shape={}".format(X_train.shape, y_train.shape))
# One-hot encode the remaining non-numeric columns.
X_train = pd.get_dummies(X_train)
X_train.head()
print("X_train.shape={}, y_train.shape={}".format(X_train.shape, y_train.shape))
from sklearn.model_selection import train_test_split
# 80/20 shuffled split with a fixed seed.
train_X, test_X, train_y, test_y = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, shuffle=True)
print("train_X.shape={}, train_y.shape={}".format(train_X.shape, train_y.shape))
print("test_X.shape={}, test_y.shape={}".format(test_X.shape, test_y.shape))
def createKerasModel(X, y):
    """Build, compile and train a dense classifier.

    Args:
        X: Feature frame; ``X.shape[1]`` sets the input width and
            ``X.values`` is fed to ``fit``.
        y: Labels, one-hot encoded with ``to_categorical`` before training.

    Returns:
        The trained ``Sequential`` model.
    """
    # Local import: this script never binds the bare name `keras`.
    from keras import initializers

    model = Sequential()
    # Kernel weights start from a truncated normal distribution.
    kernel_init = initializers.TruncatedNormal(mean=0.0, stddev=0.05, seed=None)
    model.add(Dense(input_dim=X.shape[1], units=128,
                    kernel_initializer=kernel_init, bias_initializer='zeros'))
    model.add(Activation("relu"))
    model.add(Dropout(0.2))
    model.add(Dense(32))
    model.add(Activation("relu"))
    # Two output units to match the one-hot encoded labels.
    model.add(Dense(2))
    model.add(Activation("sigmoid"))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    y_train_categorical = np_utils.to_categorical(y)
    # 150 epochs with per-epoch logging.
    model.fit(X.values, y_train_categorical, epochs=150, verbose=1)
    return model
keras_model = createKerasModel(train_X, train_y)
# One-hot encode the test labels the same way as the training labels.
y_test_categorical = np_utils.to_categorical(test_y)
loss_and_accuracy = keras_model.evaluate(test_X.values, y_test_categorical)
print("Loss={}, Accuracy={}.".format(loss_and_accuracy[0], loss_and_accuracy[1]))
# Class index = position of the larger of the two outputs. This is what
# `predict_classes` computed for multi-unit outputs, without relying on
# that method being available.
predictions_classes = np.argmax(keras_model.predict(test_X.values), axis=-1)
submission = pd.DataFrame({
    "PassengerId": test_X["PassengerId"],
    "Survived": predictions_classes})
print(submission[0:15])
股票預測
根據3000多條的百度股票數據,預測出股票曲線。
數據通過quandl開源庫獲取,使用Facebook開源的fbprophet庫來進行股票價格預測。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
!pip install quandl
import quandl
!pip install fbprophet
import fbprophet
def init_api_key():
    """Store the Quandl API key with ``quandl.save_key`` and print it."""
    # Placeholder value; replace with a real key before running.
    quandl.save_key("Your API Key")
    print(quandl.ApiConfig.api_key)


init_api_key()
# Read the stored key back and print the active configuration value.
quandl.read_key()
print(quandl.ApiConfig.api_key)
def init_stock(stock_name):
    """Download the ``WIKI/<stock_name>`` data set from Quandl.

    Args:
        stock_name: Ticker symbol appended to the ``WIKI/`` prefix.

    Returns:
        The downloaded frame with index level 0 moved into a column.
    """
    stock = quandl.get("WIKI/{}".format(stock_name))
    # Move the index into a regular column.
    stock = stock.reset_index(level=0)
    return stock
#獲取百度所有數(shù)據(jù)
# Ticker symbol to download.
stock_name = "BIDU"
baiduStock = init_stock(stock_name)
baiduStock.head()
print("baiduStock共計{}條。".format(len(baiduStock)))
# Earliest and latest dates in the data.
min_date = min(baiduStock['Date'])
max_date = max(baiduStock['Date'])
print("百度的股票數(shù)據(jù)從{}到{}。".format(min_date, max_date))
print(type(baiduStock))
# Round-trip the frame through a CSV file.
baiduStock.to_csv("baiduStock.csv", index=False)
baidu_df = pd.read_csv("baiduStock.csv")
baidu_df.head()
#數(shù)據(jù)可視化
def plot_basic_stock_history(df, start_date, end_date, stock_name):
    """Print min/max/latest adjusted close and plot the price history.

    Args:
        df: Frame with ``'Date'`` and ``'Adj. Close'`` columns.
        start_date: Unused; kept for interface compatibility.
        end_date: Object with a ``.date()`` method, shown in the "current
            price" message.
        stock_name: Name used in the plot title.
    """
    stats_Ajd_Close = 'Adj. Close'
    stat_min = min(df[stats_Ajd_Close])
    stat_max = max(df[stats_Ajd_Close])
    stat_mean = np.mean(df[stats_Ajd_Close])
    # Date of the first row holding the minimum / maximum price.
    date_stat_min = df.loc[df[stats_Ajd_Close] == stat_min, 'Date'].iloc[0].date()
    date_stat_max = df.loc[df[stats_Ajd_Close] == stat_max, 'Date'].iloc[0].date()
    print("{}在{}最小,價格是:{}美元。".format(stats_Ajd_Close, date_stat_min, stat_min))
    print("{}在{}最高,價格是:{}美元。".format(stats_Ajd_Close, date_stat_max, stat_max))
    # "Current" price is the value in the frame's last row.
    print("{}在{}當前價格是:{}美元。".format(stats_Ajd_Close, end_date.date(), df.loc[df.index[-1], 'Adj. Close']))
    plt.style.use("default")
    plt.plot(df["Date"],
             df[stats_Ajd_Close],
             color='r',
             linewidth=3,
             label=stats_Ajd_Close)
    plt.xlabel("Date")
    plt.ylabel("US $")
    plt.title("{} Stock History".format(stock_name))
    plt.grid()
    plt.show()


start_date = min_date
end_date = max_date
plot_basic_stock_history(baiduStock, start_date, end_date, stock_name)
#計算購買的股票收益
def plot_potential_profit(df,
                          start_date,
                          end_date,
                          stock_name,
                          line_color,
                          text_color,
                          myshares=1):
    """Plot the profit of buying at ``start_date`` and holding.

    Args:
        df: Frame with ``'Date'``, ``'Adj. Open'`` and ``'Adj. Close'``
            columns. A ``'profits'`` column is written into it.
        start_date: Purchase date; must match a row in ``df['Date']``.
        end_date: Sale date; must match a row in ``df['Date']``.
        stock_name: Name used in the plot title.
        line_color: Colour of the profit line.
        text_color: Colour of the total-profit annotation.
        myshares: Number of shares held.

    Raises:
        IndexError: If ``start_date`` or ``end_date`` matches no row.
    """
    # Take the first matching row explicitly instead of calling float() on
    # a one-element Series.
    start_price = float(df.loc[df["Date"] == start_date, "Adj. Open"].iloc[0])
    end_price = float(df.loc[df["Date"] == end_date, "Adj. Close"].iloc[0])
    df["profits"] = (df["Adj. Close"] - start_price) * myshares
    total_hold_profit = (end_price - start_price) * myshares
    print("從{}到{},購買{}股,總收益是:{}美元。".format(start_date.date(),
                                          end_date.date(),
                                          myshares,
                                          total_hold_profit))
    plt.style.use("default")
    plt.plot(df["Date"], df["profits"], color=line_color, linewidth=3)
    plt.xlabel("Date")
    plt.ylabel("Profit $")
    plt.title("My Shares From {} to {} on {}.".format(start_date.date(), end_date.date(), stock_name))
    # Annotation sits one month before the end date, slightly above the
    # final profit value.
    text_location_x = (end_date - pd.DateOffset(months=1)).date()
    text_location_y = total_hold_profit + (total_hold_profit / 40)
    plt.text(text_location_x,
             text_location_y,
             "${}".format(int(total_hold_profit)),
             color=text_color,
             size=15)
    plt.grid()
    plt.show()
start_date = min_date
end_date = max_date
plot_potential_profit(baiduStock, start_date, end_date, stock_name, 'm', 'g', 100)
# Same plot for a fixed window: 2012-08-07 to 2013-03-05.
start_date = pd.to_datetime("2012-08-07")
end_date = pd.to_datetime("2013-03-05")
# Compare against the timestamps directly rather than `datetime.date`
# objects, and take an explicit copy because plot_potential_profit
# writes a column into the frame it receives.
baiduStockLowerPricePhase = baiduStock[
    (baiduStock['Date'] >= start_date) &
    (baiduStock['Date'] <= end_date)
].copy()
plot_potential_profit(baiduStockLowerPricePhase, start_date, end_date, stock_name, 'c', 'r', 100)
#訓練和評估模型
def train_model(stock_history, days=0, weekly_seasonality=False, monthly_seasonality=False):
    """Fit a Prophet model and predict over history plus ``days`` periods.

    Args:
        stock_history: Frame passed to ``Prophet.fit``.
        days: Number of future periods for ``make_future_dataframe``.
        weekly_seasonality: Forwarded to Prophet's ``weekly_seasonality``.
        monthly_seasonality: When true, adds a custom 'monthly' seasonality
            (period 30.5, Fourier order 5).

    Returns:
        Tuple ``(model, forecast)`` where ``forecast`` is the output of
        ``model.predict``.
    """
    # The original hard-coded weekly_seasonality=False, silently ignoring
    # the caller's argument.
    model = fbprophet.Prophet(daily_seasonality=False,
                              weekly_seasonality=weekly_seasonality,
                              yearly_seasonality=True,
                              changepoint_prior_scale=0.05)
    if monthly_seasonality:
        model.add_seasonality(name='monthly', period=30.5, fourier_order=5)
    model.fit(stock_history)
    future = model.make_future_dataframe(periods=days)
    future = model.predict(future)
    return model, future
def create_prophet_model(df,
                         stock_name,
                         days=0,
                         weekly_seasonality=False,
                         monthly_seasonality=False):
    """Train on the last three years of ``df`` and plot fit vs. observations.

    Args:
        df: Frame with ``'Date'``, ``'ds'`` and ``'y'`` columns.
        stock_name: Name used in the plot title.
        days: Number of future periods to forecast.
        weekly_seasonality: Forwarded to ``train_model``.
        monthly_seasonality: Forwarded to ``train_model``.

    Returns:
        Tuple ``(model, forecast)`` as returned by ``train_model``.
    """
    # Cut-off is derived from the frame itself (not a module-level global)
    # and compared as a timestamp rather than a `datetime.date`.
    cutoff = df["Date"].max() - pd.DateOffset(years=3)
    stock_history = df[df["Date"] > cutoff]
    model, future = train_model(stock_history, days, weekly_seasonality, monthly_seasonality)
    plt.style.use("default")
    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(10, 5)
    # Observed values.
    ax.plot(stock_history['ds'],
            stock_history['y'],
            'v-',
            linewidth=1.0,
            alpha=0.8,
            ms=1.8,
            label='Observations')
    # Modelled values.
    ax.plot(future['ds'],
            future['yhat'],
            'o-',
            linewidth=1.,
            label='Modeled')
    # Shaded band between the lower and upper forecast bounds.
    ax.fill_between(future['ds'].dt.to_pydatetime(),
                    future['yhat_upper'],
                    future['yhat_lower'],
                    alpha=0.3,
                    facecolor='g',
                    edgecolor='k',
                    linewidth=1.0,
                    label='Confidence Interval')
    plt.legend(loc=2, prop={'size': 10})
    plt.title("{} Historical and Modeled Stock Price".format(stock_name))
    plt.xlabel('Date')
    plt.ylabel('Price $')
    plt.grid(linewidth=0.6, alpha=0.6)
    plt.show()
    return model, future
# 'ds' and 'y' are the column names the plotting/training helpers read.
baiduStock["ds"] = baiduStock['Date']
baiduStock["y"] = baiduStock['Adj. Close']
# Fit with the custom monthly seasonality and plot its components.
model, future_data = create_prophet_model(baiduStock, stock_name, monthly_seasonality=True)
model.plot_components(future_data)
plt.show()
# Fit again requesting weekly and monthly seasonality.
model, future_data = create_prophet_model(baiduStock, stock_name, weekly_seasonality=True, monthly_seasonality=True)
model.plot_components(future_data)
plt.show()
# Forecast 180 periods beyond the end of the history.
model, future = create_prophet_model(baiduStock, stock_name, days=180)
# Buying-strategy evaluation.
# NOTE(review): prophet_evaluator is not defined in this file -- presumably a local helper module; confirm.
import prophet_evaluator
baiduStock["ds"] = baiduStock['Date']
baiduStock["y"] = baiduStock['Adj. Close']
prophet_evaluator.evaluator(baiduStock, min_date, max_date, train_model, stock_name, 1000)
影評的情感分析
情感分析在自然語言處理(NLP)領域是很復雜的,有主觀的,也有客觀的。基于當前環境,針對不同的人或物,我們應該做出什么樣的情感反應。下面講解如何通過分析情感文本數據,預測出說話者在當時的情況下的情緒狀態是積極的,還是消極的。
生活中就有很多例子,比如在京東、淘寶等電商平臺購物后,用戶都會被請求對收到的貨物進行拍照、點贊、評論和評價星級等。平臺收集這些數據后去做情感分析,從而通過了解買家對于產品的喜好和滿意度來改善產品和服務。這為平臺提供了一些潛在的用戶會購買哪些產品的數據。
下面使用循環神經網絡(RNN)來編寫該神經網絡模型的代碼,創建此網絡模型會使用到長短期記憶網絡(LSTM)和嵌入層(Embedding Layers),最后的輸出層會使用sigmoid激活函數,因為我們預測的結果要么是積極的,要么是消極的。
數據文件在這:
鏈接:https://pan.baidu.com/s/1DQdAROwzOT6nXdWBYeT2bw 密碼:1rn7
基于TensorFlow
import numpy as np
import tensorflow as tf
# 定義加載數(shù)據(jù)的函數(shù)
def loadData():
    """Read the review and label files as two raw strings.

    Returns:
        Tuple ``(reviews, labels)``: the full text of ``reviews.txt`` and
        of ``labels.txt``.
    """
    with open('reviews.txt', 'r') as f:
        reviews = f.read()
    with open('labels.txt', 'r') as f:
        labels = f.read()
    return reviews, labels


reviews, labels = loadData()
# Peek at the first 150 characters of each string.
reviews[:150]
labels[:150]
from string import punctuation
# 定義數(shù)據(jù)預處理函數(shù)
def dataPreprocess(reviews_str):
    """Strip punctuation and split the raw text into reviews and words.

    Args:
        reviews_str: Raw text with one review per line.

    Returns:
        Tuple ``(review_list, all_text, words)``:
        ``review_list`` -- punctuation-free text split on newlines;
        ``all_text`` -- those reviews joined with single spaces;
        ``words`` -- ``all_text`` split on whitespace.
    """
    # Delete every punctuation character in one pass.
    all_text = reviews_str.translate(str.maketrans('', '', punctuation))
    # Split on the newline character (the original had lost the backslash
    # and split on the letter 'n').
    review_list = all_text.split('\n')
    all_text = ' '.join(review_list)
    words = all_text.split()
    return review_list, all_text, words
# 調(diào)用函數(shù)
reviews, all_text, words = dataPreprocess(reviews)
reviews[:2]
# Peek at the first 20 words.
words[:20]
# Peek at the first 150 characters of the joined text.
all_text[:150]
# Word encoding.
from collections import Counter
# Count how many times each word occurs.
word_counter = Counter(words)
# Words ordered by descending count.
sorted_vocab = sorted(word_counter, key=word_counter.get, reverse=True)
# 定義顯示前10個單詞以及它的重復個數(shù)的函數(shù)
def showTop10Item(dict_obj):
    """Print the first 10 ``key:value`` pairs of a mapping, one per line.

    Args:
        dict_obj: Mapping whose ``items()`` are printed in iteration order.
    """
    from itertools import islice

    for k, v in islice(dict_obj.items(), 10):
        print("{}:{}".format(k, v))
# 顯示變量word_counter里的單詞和它對應的數(shù)量
# First 10 words and their counts.
showTop10Item(word_counter)
# The 15 most frequent words with their counts.
word_counter.most_common(15)
# The same 15 words taken from the sorted vocabulary.
sorted_vocab[:15]
# Map each word to an integer index starting at 1.
vocab_to_int = {word: i for i, word in enumerate(sorted_vocab, 1)}
showTop10Item(vocab_to_int)
# Replace every word of every review by its integer index.
reviews_ints = [[vocab_to_int[word] for word in review.split()]
                for review in reviews]
print(reviews_ints[:1])
len(reviews_ints)
# Label encoding: 'positive' -> 1, anything else -> 0. Labels are split
# on the newline character (the original had lost the backslash).
labels = labels.split('\n')
labels = np.array([1 if label == 'positive' else 0 for label in labels])
labels[:10]
from collections import Counter
# Histogram of review lengths: length -> number of reviews.
review_lens = Counter([len(x) for x in reviews_ints])
# Report the smallest length. The original printed review_lens[0], the
# number of empty reviews, under this "minimum length" message.
print("評論的最小長度是: {}".format(min(review_lens)))
print("評論的最大長度是: {}".format(max(review_lens)))
# Indices of the reviews that are not empty.
non_zero_idx = [i for i, review in enumerate(reviews_ints) if len(review) != 0]
print(len(non_zero_idx))
# Keep only the non-empty reviews and their matching labels.
reviews_ints = [reviews_ints[i] for i in non_zero_idx]
labels = np.array([labels[i] for i in non_zero_idx])
# 現(xiàn)在,我們要創(chuàng)建一個features的變量來作為特征向量(Feature Vector),這個數(shù)據(jù)就是我們要傳遞到神經(jīng)網(wǎng)絡中的,
# 數(shù)據(jù)來自于reviews_ints變量。因為我們要傳遞整型的數(shù)值到神經(jīng)網(wǎng)絡中,且每行的數(shù)值不能
# 超過200個;所以就是,不足200長度的評論,前面使用0來填充;超過200長度的,我們截斷前
# 200個字符串的長度。
# 定義一個評論的字符串最大長度是200
# Fixed width of every feature row.
seq_len = 200
# One zero-initialised row per review.
features = np.zeros((len(reviews_ints), seq_len), dtype=int)
for i, row in enumerate(reviews_ints):
    # Right-align the review in its row: short reviews keep leading zeros,
    # long reviews are cut to their first seq_len entries.
    features[i, -len(row):] = np.array(row)[:seq_len]
features[0:1]
features.shape
# 拆分訓練集、驗證集和測試集數(shù)據(jù)
# 定義80%的數(shù)據(jù)用于訓練
# Fraction of the data used for training.
split_train_ratio = 0.8
features_len = len(features)
train_len = int(features_len * split_train_ratio)
# First 80% for training, the remainder held out.
train_x, val_x = features[:train_len], features[train_len:]
train_y, val_y = labels[:train_len], labels[train_len:]
# Split the held-out part in half: validation first, test second.
val_x_half_len = int(len(val_x) / 2)
val_x, test_x = val_x[:val_x_half_len], val_x[val_x_half_len:]
val_y, test_y = val_y[:val_x_half_len], val_y[val_x_half_len:]
# Tab and newline escapes restored (the backslashes had been lost).
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))
# 定義超參數(shù)
# Hyperparameters.
lstm_size = 256
lstm_layers = 2
batch_size = 512
learning_rate = 0.01
# Vocabulary size plus one (index 0 is the padding value).
n_words = len(vocab_to_int) + 1
# Start from a clean default graph.
tf.reset_default_graph()
with tf.name_scope('inputs'):
    # Integer word indices and integer labels; both dimensions left open.
    inputs_ = tf.placeholder(tf.int32, [None, None], name="inputs")
    labels_ = tf.placeholder(tf.int32, [None, None], name="labels")
    # Dropout keep probability, fed at run time.
    keep_prob = tf.placeholder(tf.float32, name="keep_prob")
# Width of each embedding vector.
embed_size = 300
with tf.name_scope("Embeddings"):
    # Embedding table initialised uniformly in [-1, 1).
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs_)
def lstm_cell():
    """Create one LSTM cell wrapped with output dropout.

    Uses the module-level ``lstm_size`` and ``keep_prob``.

    Returns:
        A ``DropoutWrapper`` around a ``BasicLSTMCell``.
    """
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size, reuse=tf.get_variable_scope().reuse)
    return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
# 給graph上的tensors的RNN層添加一個前綴RNN_layers
with tf.name_scope("RNN_layers"):
    # Stack `lstm_layers` independent cells.
    cell = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(lstm_layers)])
    # All-zero starting state sized for one batch.
    initial_state = cell.zero_state(batch_size, tf.float32)
with tf.name_scope("RNN_forward"):
    # Per-step outputs and the final state of the stacked cell.
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)
with tf.name_scope('predictions'):
    # Single sigmoid unit on the output of the last time step.
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)
with tf.name_scope('cost'):
    # Mean squared error between labels and predictions.
    cost = tf.losses.mean_squared_error(labels_, predictions)
with tf.name_scope('train'):
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
with tf.name_scope('validation'):
    # A prediction counts as correct when its rounded value equals the label.
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
# 定義獲取數(shù)據(jù)批次的生成器函數(shù)
def get_batches(x, y, batch_size=100):
    """Yield consecutive full-size ``(x, y)`` batches.

    Args:
        x: Sliceable sequence of inputs.
        y: Sliceable sequence of targets, aligned with ``x``.
        batch_size: Number of items per batch.

    Yields:
        Tuples ``(x_batch, y_batch)`` of exactly ``batch_size`` items each.
        Trailing items that do not fill a whole batch are dropped.
    """
    n_batches = len(x) // batch_size
    # Trim both sequences to a whole number of batches.
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]
# 設置迭代次數(shù),8次
import os

# Number of passes over the training set.
epochs = 8
saver = tf.train.Saver()
# Make sure the checkpoint directory exists before the first save.
os.makedirs("checkpoints", exist_ok=True)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        # Each epoch starts from the zero state.
        state = sess.run(initial_state)
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {inputs_: x,
                    labels_: y[:, None],
                    keep_prob: 0.5,
                    initial_state: state}
            # The final state of this batch is fed into the next one.
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            # Training log every 5 iterations.
            if iteration % 5 == 0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))
            # Full validation pass every 25 iterations, dropout disabled.
            if iteration % 25 == 0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: x,
                            labels_: y[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    # Single-line assignment: the original split it across
                    # two lines without a continuation.
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration += 1
        # NOTE(review): checkpoint placement reconstructed as once per
        # epoch; the original indentation was lost -- confirm.
        saver.save(sess, "checkpoints/sentiment.ckpt")
    # Final checkpoint after all epochs.
    saver.save(sess, "checkpoints/sentiment.ckpt")
test_acc = []
with tf.Session() as sess:
    # Restore the trained weights from the checkpoint.
    saver.restore(sess, "checkpoints/sentiment.ckpt")
    # Start the test pass from the zero state.
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    for ii, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        feed = {inputs_: x,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    # Mean accuracy over all test batches.
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))
基于Keras
#基于Keras
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# 為了確??蓮同F(xiàn)性,我們設置一個隨機種子
# Seed NumPy's random generator for repeatability.
numpy.random.seed(7)
# Vocabulary cap passed to imdb.load_data as num_words.
top_words = 5000
# Load the IMDB data set.
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
# Fixed length every review is padded or truncated to.
review_max_length = 500
# Bring every review to exactly review_max_length entries.
X_train = sequence.pad_sequences(X_train, maxlen=review_max_length)
X_test = sequence.pad_sequences(X_test, maxlen=review_max_length)
# Build the model; each word index maps to a 32-dimensional embedding.
embedding_vecor_length = 32
model = Sequential()
# Embedding input layer.
model.add(Embedding(top_words, embedding_vecor_length, input_length=review_max_length))
# LSTM hidden layer with 100 units.
model.add(LSTM(100))
# Single sigmoid output unit for the binary label.
model.add(Dense(1, activation='sigmoid'))
# Binary cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Print the model architecture.
model.summary()
# Train for 3 epochs with batches of 64.
model.fit(X_train, y_train, epochs=3, batch_size=64)
# Evaluate on the test set; scores[1] is the accuracy metric.
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: {}".format((scores[1]*100)))