Accompanies https://arxiv.org/abs/2006.13311
These code snippets were included to show the development of machine learning tools in the 2010s, and the strides Python and the machine learning community have made toward a consolidated ML API in scikit-learn. In the following, we use scikit-learn's utility functions to generate a synthetic classification data set and split it into train and test sets.
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=5000, n_features=5,
                           n_informative=3, n_redundant=0,
                           random_state=0, shuffle=False)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=0)
Train an SVM, predict, score, and evaluate feature importance.
from sklearn.svm import SVC
svm = SVC(random_state=0, probability=True)
svm.fit(X_train, y_train)
print(svm.predict([[0, 0, 0, 0, 0], [-1, -1, -1, -1, -1], [1, 1, 1, 1, 1]]))
print(svm.score(X_train, y_train))
print(svm.score(X_test, y_test))
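Since the SVC was constructed with probability=True, it also exposes class-probability estimates; a minimal sketch querying them for one point:
print(svm.predict_proba([[0, 0, 0, 0, 0]]))  # class probabilities, fitted via internal cross-validation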
from sklearn.inspection import permutation_importance
importances = permutation_importance(svm, X_train, y_train, n_repeats=10, random_state=0)
print(importances.importances_mean)
print(importances.importances_mean.argsort())
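The permutation_importance result is a Bunch that also carries the spread across repeats in its importances_std field; a small sketch printing a ranked mean ± std report:
# Rank features by mean permutation importance, with spread across the repeats
for idx in importances.importances_mean.argsort()[::-1]:
    print(f"feature {idx}: {importances.importances_mean[idx]:.3f}"
          f" +/- {importances.importances_std[idx]:.3f}")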
Train a Random Forest, predict, score, and evaluate feature importance.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth=7, random_state=0)
rf.fit(X_train, y_train)
print(rf.predict([[0, 0, 0, 0, 0], [-1, -1, -1, -1, -1], [1, 1, 1, 1, 1]]))
print(rf.feature_importances_)
print(rf.feature_importances_.argsort())
print(rf.score(X_train, y_train))
print(rf.score(X_test, y_test))
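Note that feature_importances_ is the impurity-based importance computed during training. As a complementary sketch, the same permutation importance used above can be run on the held-out test set (rf_importances is an illustrative name):
from sklearn.inspection import permutation_importance
rf_importances = permutation_importance(rf, X_test, y_test,
                                        n_repeats=10, random_state=0)
print(rf_importances.importances_mean)
print(rf_importances.importances_mean.argsort())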
Train two Gaussian Processes with different kernels, predict, score, and evaluate feature importance.
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import DotProduct as DP
gp = GaussianProcessClassifier(random_state=0, copy_X_train=False)  # default RBF kernel
gp0 = GaussianProcessClassifier(kernel=DP(.25), random_state=0, copy_X_train=False)  # linear (dot-product) kernel
gp.fit(X_train[:1000], y_train[:1000])  # fit on a 1000-sample subset: GP training scales cubically
gp0.fit(X_train[:1000], y_train[:1000])
print(gp.predict([[0, 0, 0, 0, 0], [-1, -1, -1, -1, -1], [1, 1, 1, 1, 1]]))
print(gp0.predict([[0, 0, 0, 0, 0], [-1, -1, -1, -1, -1], [1, 1, 1, 1, 1]]))
print(gp.score(X_train, y_train))
print(gp.score(X_test, y_test))
print(gp0.score(X_train, y_train))
print(gp0.score(X_test, y_test))
importances = permutation_importance(gp0, X_train, y_train, n_repeats=10, random_state=0)
print(importances.importances_mean)
print(importances.importances_mean.argsort())
importances = permutation_importance(gp, X_train, y_train, n_repeats=10, random_state=0)
print(importances.importances_mean)
print(importances.importances_mean.argsort())
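For reference, gp above relies on scikit-learn's default kernel, which the library documents as 1.0 * RBF(1.0); a sketch spelling it out explicitly (gp_rbf is an illustrative name and should behave like gp, assuming that default):
from sklearn.gaussian_process.kernels import RBF
gp_rbf = GaussianProcessClassifier(kernel=1.0 * RBF(1.0),  # explicit default kernel
                                   random_state=0, copy_X_train=False)
gp_rbf.fit(X_train[:1000], y_train[:1000])
print(gp_rbf.score(X_test, y_test))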
Train a deep neural network, predict, and score its accuracy. Then train the same architecture ten times with different initializations to evaluate the convergence of the model.
import tensorflow as tf
tf.random.set_seed(900)
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(.3),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(2, activation='softmax')
])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(X_train,
          y_train,
          validation_split=.1,
          epochs=100)
model.evaluate(X_test, y_test)
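The Keras model returns softmax probabilities rather than class labels; a minimal sketch mirroring the predict calls of the scikit-learn models above, taking the argmax over the two output units:
import numpy as np
probs = model.predict(np.array([[0, 0, 0, 0, 0], [-1, -1, -1, -1, -1], [1, 1, 1, 1, 1]]))
print(probs.argmax(axis=1))  # softmax probabilities -> class labels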
Here we train ten models from ten different random initializations to evaluate the convergence of the model.
import numpy as np
seeds = 10
train_loss = np.zeros((seeds,100))
val_loss = np.zeros((seeds,100))
train_acc = np.zeros((seeds,100))
val_acc = np.zeros((seeds,100))
for i in range(seeds):
    tf.keras.backend.clear_session()
    del model
    tf.random.set_seed(i*100)
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dropout(.3),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(2, activation='softmax')
    ])
    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    history = model.fit(X_train,
                        y_train,
                        validation_split=.1,
                        epochs=100,
                        verbose=0)
    train_loss[i,:] = history.history['loss']
    val_loss[i,:] = history.history['val_loss']
    train_acc[i,:] = history.history['accuracy']
    val_acc[i,:] = history.history['val_accuracy']
train_loss_u = np.mean(train_loss, axis=0)
train_loss_s = np.std(train_loss, axis=0)
val_loss_u = np.mean(val_loss, axis=0)
val_loss_s = np.std(val_loss, axis=0)
train_acc_u = np.mean(train_acc, axis=0)
train_acc_s = np.std(train_acc, axis=0)
val_acc_u = np.mean(val_acc, axis=0)
val_acc_s = np.std(val_acc, axis=0)
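Before plotting, the final-epoch statistics already summarize convergence across the ten seeds; a short sketch reporting them:
print(f"final val accuracy: {val_acc_u[-1]:.3f} +/- {val_acc_s[-1]:.3f}")
print(f"final val loss: {val_loss_u[-1]:.3f} +/- {val_loss_s[-1]:.3f}")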
from cycler import cycler
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.colors import ListedColormap
plt.rc('font', family='serif')
plt.rc('xtick', labelsize='x-small')
plt.rc('ytick', labelsize='x-small')
plt.rc('axes', prop_cycle=(cycler(color=['#67001f', '#053061'])))
# summarize history for accuracy
fig, axs = plt.subplots(figsize=(8,3), ncols=2)
x = list(range(100))
axs[1].fill_between(x, train_acc_u-train_acc_s*2, train_acc_u+train_acc_s*2,
                    alpha=.2, facecolor='C0')
axs[1].fill_between(x, val_acc_u-val_acc_s*2, val_acc_u+val_acc_s*2,
                    alpha=.2, facecolor='C1')
axs[1].plot(x, train_acc_u, color='C0', linewidth=1)
axs[1].plot(x, val_acc_u, color='C1', linewidth=1)
axs[1].set_title('Neural Network Accuracy')
axs[1].set_ylabel('Metric: Accuracy')
axs[1].set_xlabel('Training Epoch')
# summarize history for loss
axs[0].fill_between(x, train_loss_u-train_loss_s*2, train_loss_u+train_loss_s*2,
                    alpha=.2, facecolor='C0')
axs[0].fill_between(x, val_loss_u-val_loss_s*2, val_loss_u+val_loss_s*2,
                    alpha=.2, facecolor='C1')
axs[0].plot(x, train_loss_u, color='C0', linewidth=1)
axs[0].plot(x, val_loss_u, color='C1', linewidth=1)
axs[0].set_title('Neural Network Loss')
axs[0].set_ylabel('Loss: Cross-Entropy')
axs[0].set_xlabel('Training Epoch')
axs[1].legend(['2σ contains 95%', '2σ contains 95%', 'Training Data', 'Validation Data'], loc='best')  # labels follow artist order: bands first, then lines
fig.subplots_adjust(left=0.01)
fig.savefig('nn-loss.png', dpi=300, bbox_inches='tight')
plt.show()
classifiers = {"SVM": svm, "Gaussian Process": gp, "Random Forest": rf, "DNN": model}
# Create a mesh to calculate all predictions on
h = .02 # step size in the mesh
x_min, x_mean, x_max = X[:, 0].min() - .5, np.mean(X[:, 0]), X[:, 0].max() + .5
y_min, y_mean, y_max = X[:, 1].min() - .5, np.mean(X[:, 1]), X[:, 1].max() + .5
z_min, z_mean, z_max = X[:, 2].min() - .5, np.mean(X[:, 2]), X[:, 2].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
The following function plots the input data and the 2D decision surface of each classifier on a slice through the 3D data volume.
def sklearn_slice(classifiers, offset, h=0.02, out_file=None):
    """Plot input data and 2D decision surfaces of 3D data.

    classifiers (dict): Classifiers to portray. The key is used as the panel
        title and the value should be the fitted model.
    offset (float): Position of the 2D slice within the 3D volume.
    h (float): Half-thickness of the slice along the third feature.
    out_file (str): Filename of the image to save to; show the figure if None.
    """
    i = 1
    # z_slice = (z_min, z_max)  # full volume
    z_slice = (offset - h, offset + h)  # specific slice
    z_bool = (X[:, 2] > z_slice[0]) & (X[:, 2] < z_slice[1])
    z_train_bool = (X_train[:, 2] > z_slice[0]) & (X_train[:, 2] < z_slice[1])
    z_test_bool = (X_test[:, 2] > z_slice[0]) & (X_test[:, 2] < z_slice[1])
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    figure = plt.figure(figsize=(4 * (len(classifiers) + 1), 4))
    ax = plt.subplot(1, len(classifiers) + 1, i)
    ax.set_title("Input data")
    # Plot all data points outside the slice in grey
    all_plot = ax.scatter(X[np.logical_not(z_bool), 0], X[np.logical_not(z_bool), 1],
                          facecolor='#C0C0C0', label="All Data")
    # Plot the training points
    train_plot = ax.scatter(X_train[z_train_bool, 0], X_train[z_train_bool, 1],
                            c=y_train[z_train_bool], cmap=cm_bright,
                            edgecolors='k', label="Training Data")
    # Plot the testing points
    test_plot = ax.scatter(X_test[z_test_bool, 0], X_test[z_test_bool, 1],
                           c=y_test[z_test_bool], cmap=cm_bright,
                           marker='x', label="Test Data")
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    grey_patch = mpatches.Patch(color='#C0C0C0', label='All Data (3D)')
    red_patch = mpatches.Patch(color='#FF0000', label='Class 0')
    blue_patch = mpatches.Patch(color='#0000FF', label='Class 1')
    ax.legend(handles=[grey_patch, red_patch, blue_patch, train_plot, test_plot], loc='best')
    i += 1
    # iterate over classifiers
    for name, clf in classifiers.items():
        ax = plt.subplot(1, len(classifiers) + 1, i)
        try:
            score = clf.score(X_test, y_test)  # scikit-learn estimators
        except AttributeError:
            score = clf.evaluate(X_test, y_test, verbose=0)[1]  # Keras models
        # Plot the decision boundary. For that, we assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max], fixing the
        # remaining three features at (offset, 0, 0).
        zrs = np.zeros_like(yy.ravel())
        grid = np.c_[xx.ravel(), yy.ravel(), zrs + offset, zrs, zrs]
        if hasattr(clf, "predict_proba"):  # scikit-learn estimators
            Z = clf.predict_proba(grid)[:, 1]
        else:  # Keras models: column 1 of the softmax output
            Z = clf.predict(grid, verbose=0)[:, 1]
        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
        # Plot the training points
        ax.scatter(X_train[z_train_bool, 0], X_train[z_train_bool, 1],
                   c=y_train[z_train_bool], cmap=cm_bright, edgecolors='k')
        # Plot the testing points
        ax.scatter(X_test[z_test_bool, 0], X_test[z_test_bool, 1],
                   c=y_test[z_test_bool], cmap=cm_bright, marker='x')
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title(name)
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1
    plt.tight_layout()
    if out_file is None:
        plt.show()
    else:
        plt.savefig(out_file, bbox_inches='tight')
        plt.close()
sklearn_slice(classifiers, z_mean, out_file="decision-boundaries.png")
sklearn_slice({"Linear Kernel": gp0, "RBF Kernel": gp}, z_mean, out_file="gaussian-processes.png")
# iterate over classifiers
for name, clf in classifiers.items():
sklearn_slice({name: clf}, z_mean, out_file=name+".png")
Iterate over the entire volume, create an image for each slice, then collect all images into a movie.
import os
import imageio
from tqdm.notebook import tqdm
drivepath = "losses/"
os.makedirs(drivepath, exist_ok=True)  # make sure the output directory exists
slices = np.arange(z_min + h, z_max - h, h)
for x, off in enumerate(tqdm(slices)):
    sklearn_slice(classifiers, off, out_file=f"{drivepath}loss3D-{x:02d}.png")
images = []
for i in range(len(slices)):  # originally a hard-coded 439 frames
    images.append(imageio.imread(f"{drivepath}loss3D-{i:02d}.png"))
imageio.mimsave('decision_boundary2.mp4', images[::3])  # mp4 writing needs the imageio-ffmpeg plugin
imageio.mimsave('decision_boundary2.gif', images)