機械学習ことはじめ（Vol.75掲載）

著者：川嶋宏彰

本連載では、機械学習の基礎となるさまざまな手法の仕組みや、それらの手法のPythonでの利用方法を解説していきます。今回は、入力データがどのようなクラス（カテゴリ）であるかを予測する分類の問題を扱います。

シェルスクリプトマガジン Vol.75は以下のリンク先でご購入できます。

図2　データセットの読み込みから、散布図のプロットまでを実施するPythonコード

import seaborn as sns
import matplotlib.pyplot as plt

plt.rcParams['font.size'] = 14
penguins = sns.load_dataset('penguins')
# 取り出す特徴量
features = ['bill_depth_mm', 'body_mass_g']  # （★）
# features = ['bill_depth_mm', 'bill_length_mm']  # 取り出す特徴量を変える
# 対象とするペンギン種
target_species = ['Adelie', 'Gentoo']  # （★）
# target_species = ['Adelie', 'Chinstrap', 'Gentoo']  # 3種のペンギンにする
# 今回用いる特徴量をクラスラベルと共に取り出す
df = penguins[['species'] + features].copy()
df.dropna(inplace=True)  # NaN が含まれる行は削除
# 今回用いるペンギン種のみとする
df2 = df[df['species'].isin(target_species)].copy()
print(df2.shape)  # (274, 3) と表示される
# 現在のパレットから用いる種 (target_species) に合わせたパレットを作成
palette = {c: sns.color_palette()[k] for k, c
           in enumerate(df['species'].unique()) if c in target_species}
fig = plt.figure(figsize=(5, 5))
sns.scatterplot(data=df2, x=features[0], y=features[1],
                hue='species', palette=palette)
plt.show()

図3　ロジスティック回帰の準備をするPythonコード

from sklearn.linear_model import LogisticRegression
import numpy as np

X = df2[features].values   # 各個体の2次元特徴量（行数 = 個体数）
y = df2['species'].values  # 各個体のクラスラベル
clf_lr = LogisticRegression(penalty='none')  # 分類モデルの準備

図4　特徴量一つ（体重）のみでロジスティック回帰をするPythonコード

f_idx = 1  # 用いる特徴量のインデックス
# 指定した列（特徴量）だけ取り出して学習
clf_lr.fit(X[:, [f_idx]], y)
# 学習されたパラメータを表示
w0 = clf_lr.intercept_[0]
w1 = clf_lr.coef_[0][0]
print(f'w_0 = {w0:.2f}, w_1 = {w1:.4f}, b = -w0/w1 = {-w0/w1:.2f}')
# 予測値および分類結果の可視化用グリッド
xmin = np.min(X[:, f_idx])
xmax = np.max(X[:, f_idx])
xs = np.linspace(xmin - (xmax - xmin) / 10,
                 xmax + (xmax - xmin) / 10, 100)  # 等間隔に100点用意
fig = plt.figure(figsize=(10, 5))
# 元データのプロット
sns.scatterplot(x=X[:, f_idx], y=(y=='Gentoo'), hue=y, palette=palette)
# 予測値（事後確率）の曲線をプロット
y_pred = clf_lr.predict_proba(xs.reshape(-1, 1))[:, 1]
plt.plot(xs, y_pred, c=palette['Gentoo'])
plt.ylim(-0.1, 1.1)
plt.xlabel(f'x ({features[f_idx]})')
plt.ylabel('y')
plt.grid()
plt.show()

図6　特徴量二つ（くちばしの高さと体重）でロジスティック回帰をするPythonコード

import matplotlib as mpl
from sklearn.preprocessing import StandardScaler

def plot_decision_boundary(X, y, clf, xylabels=None, palette=None, fig=None, ngrid=50):
  """ 分類器 clf の決定境界を描画 """
  if fig is None: fig = plt.figure(figsize=(5, 5))
  else: plt.figure(fig.number)
  # 2次元空間にグリッド点を準備
  xmin = X.min(axis=0)  # 各列の最小値
  xmax = X.max(axis=0)  # 各列の最大値
  xstep = [(xmax[j]-xmin[j]) / ngrid for j in range(2)]  # グリッドのステップ幅
  xmin = [xmin[j] - 8*xstep[j] for j in range(2)]  # 少し広めに
  xmax = [xmax[j] + 8*xstep[j] for j in range(2)]
  aranges = [np.arange(xmin[j], xmax[j] + xstep[j], xstep[j]) for j in range(2)]
  x0grid, x1grid = np.meshgrid(*aranges)
  # 各グリッド点でクラスを判定
  y_pred = clf.predict(np.c_[x0grid.ravel(), x1grid.ravel()])
  y_pred = y_pred.reshape(x0grid.shape)  # 2次元に
  y_pred = np.searchsorted(np.unique(y_pred), y_pred)  # 値をインデックスに
  clist = palette.values() if type(palette) is dict else palette
  cmap = mpl.colors.ListedColormap(clist) if palette is not None else None
  plt.contourf(x0grid, x1grid, y_pred, alpha=0.3, cmap=cmap)
  sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, palette=palette)
  plt.legend()
  plt.xlim([xmin[0], xmax[0]])
  plt.ylim([xmin[1], xmax[1]])
  if xylabels is not None:
    plt.xlabel(xylabels[0])
    plt.ylabel(xylabels[1])
  return fig

Xs = StandardScaler().fit_transform(X)  # 標準化によるスケーリング
clf_lr.fit(Xs, y)  # 二つの特徴量を両方用いて学習
print('w0:', clf_lr.intercept_)
print('w1, w2:', clf_lr.coef_)
xylabels = [s + ' (standardized)' for s in features]  # 軸ラベル変更
plot_decision_boundary(Xs, y, clf_lr, xylabels, palette, ngrid=100)  # 決定境界
plt.show()

図9　ロジスティック回帰による各点での予測値をプロットするPythonコード

from matplotlib import cm
from mpl_toolkits.mplot3d import axes3d

# メッシュで
def gen_2dgrid(X):
  """ 2次元メッシュグリッドの生成 """
  d = X.shape[1]
  xmin = X.min(axis=0)  # 各列の最小値
  xmax = X.max(axis=0)  # 各列の最大値
  xstep = [(xmax[j]-xmin[j]) / 100 for j in range(d)]  # グリッドのステップ幅
  xmin = [xmin[j] - 10*xstep[j] for j in range(d)]  # 少し広めに
  xmax = [xmax[j] + 10*xstep[j] for j in range(d)]
  aranges = [np.arange(xmin[j], xmax[j] + xstep[j], xstep[j]) for j in range(2)]
  return np.meshgrid(*aranges)  # d=2のときは (x0grid, x1grid) が返る
# 2次元の場合の予測値 (proba)
def proba_2dgrid(clf, x0grid, x1grid):
  """ 2次元メッシュの各点での予測値を計算 """    
  return clf.predict_proba(np.c_[x0grid.ravel(), x1grid.ravel()])[:, 1]
                          .reshape(x0grid.shape)
xgrid_2d = gen_2dgrid(Xs)  # xgrid_2d: (x0grid, x1grid) のような二つ組
proba_2d = proba_2dgrid(clf_lr, *xgrid_2d)  # 予測値
# 3次元プロット
fig = plt.figure(figsize=(14, 8))
ax = fig.add_subplot(111, projection='3d')
cmap = cm.winter  # カラーマップを設定
ax.plot_surface(*xgrid_2d, proba_2d, cmap=cmap)
# ax.set_box_aspect((3, 3, 1))  # matplotlib のバージョンが3.3以上で利用可能
ax.set_zlim(-0.1, 1.1)
ax.set_xlabel(xylabels[0])
ax.set_ylabel(xylabels[1])
ax.set_zlabel('y')
ax.set_zticks([0, 0.5, 1.0])
ax.view_init(elev=60, azim=-120)  # 3次元プロットの表示角度の設定
plt.show()

図12　線形SVM による学習と決定境界の表示をするPythonコード

from sklearn.svm import SVC

C = 10  # 大きいほどハードマージンに近い
clf_lin = SVC(kernel='linear', C=C)
clf_lin.fit(Xs, y)
# 決定境界とサポートベクターを可視化
plot_decision_boundary(Xs, y, clf_lin, xylabels, palette, ngrid=100)
plt.scatter(clf_lin.support_vectors_[:, 0], clf_lin.support_vectors_[:, 1],
            s=50, facecolors='none', edgecolors='r')
plt.title(f'SVM(linear) C = {C}')
plt.show()

図17　非線形SVMとパラメータCの関係を示すPythonコード

from sklearn.datasets import make_moons

X_moon, y_moon = make_moons(n_samples=100, noise=0.2, random_state=6)
palette_o = {k: sns.color_palette()[k] for k in range(2)}
# moonデータの散布図をプロット
plt.figure(figsize=(5, 5))
sns.scatterplot(x=X_moon[:, 0], y=X_moon[:, 1],
                hue=y_moon, palette=palette_o)
plt.show()
# 異なるCでSVM（RBFカーネル）の学習と決定境界の表示
for C in [0.1, 10, 10000]:
  clf_rbf = SVC(C=C, kernel='rbf')  # kernel='rbf'は省略可能
  clf_rbf.fit(X_moon, y_moon)
  # 決定境界をプロット
  plot_decision_boundary(X_moon, y_moon,
                         clf_rbf, None, palette_o, ngrid=100)
  plt.title(f'SVM(rbf) C = {C}')
  plt.show()

図20　グリッドサーチをするPythonコード

from sklearn.model_selection import cross_val_score

best_score = -1
for gamma in [0.1, 0.5, 1, 2, 10]:  # 5通り
  for C in [0.1, 1, 10, 100, 1000, 10000]:  # 6通り
    print(f'gamma: {gamma},\tC: {C}', end='')
    svm = SVC(gamma=gamma, C=C)
    cv_scores = cross_val_score(svm, X_moon, y_moon, cv=5)
    score = np.mean(cv_scores)
    print(f'\t | average: {score:.2} <- {cv_scores}')
    if score > best_score:
      best_score = score
      best_params = {'C': C, 'gamma': gamma}
print('best_score:', best_score)
print('best_params:', best_params)
clf_rbf = SVC(**best_params)
clf_rbf.fit(X_moon, y_moon)
plot_decision_boundary(X_moon, y_moon,
                       clf_rbf, None, palette_o, ngrid=100)
title = f"C = {best_params['C']}, gamma = {best_params['gamma']}"
plt.title(title)
plt.show()