# 获取MNIST数据集
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, cache=True, as_frame=False)# 查看测试器和标签
X, y = mnist['data'], mnist['target']
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
# 对数据进行洗牌,防止输入许多相似实例导致的执行性能不佳
import numpy as np
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]# 重新创建目标向量(以是5和非5作为二分类标准)
y_train_5 = (y_train == '5')
y_test_5 = (y_test == '5')
some_digit = X[36000]from sklearn.linear_model import SGDClassifiersgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train)
sgd_clf.predict([some_digit])
# 1. 使用OvO(一对一)策略,基于SGD创建多分类器
from sklearn.multiclass import OneVsOneClassifierovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
ovo_clf.fit(X_train, y_train)
ovo_clf.predict([some_digit])
# 1. 训练随机森林(因为随机森林本身就可以进行多分类)
from sklearn.ensemble import RandomForestClassifierforest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(X_train, y_train)
forest_clf.predict([some_digit])
# 1. 使用交叉验证对SGD多分类器进行评估
from sklearn.model_selection import cross_val_scorecross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")
# 2. 对训练集进行标准化,再进行评估看看
from sklearn.preprocessing import StandardScalerscaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64)) # 标准化
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")
# 混淆矩阵
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
conf_mx = confusion_matrix(y_train, y_train_pred)
# 绘制混淆矩阵的图像
import matplotlib.pyplot as pltplt.matshow(conf_mx, cmap=plt.cm.gray)
plt.show()
# 将混淆矩阵中的每个值 除以 相应类别中的图片数量
row_sums = conf_mx.sum(axis=1, keepdims=True) # 同行相加
norm_conf_mx = conf_mx / row_sums# 用0填充对角线,保存错误,重新绘制混淆矩阵
np.fill_diagonal(norm_conf_mx, 0)
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
plt.show()
上一篇:多线程与高并发(一)