機械学習ことはじめ

著者：川嶋宏彰

本連載では、機械学習の基礎となるさまざまな手法の仕組みや、それらの手法のPythonでの利用方法を解説していきます。今回は、入力から予測値を出力する「回帰モデル」と、その教師あり学習について紹介します。

シェルスクリプトマガジン Vol.74は以下のリンク先でご購入できます。

図5　気温に対するチョコレート菓子支出金額の散布図をプロットするPython コード

import pandas as pd
import matplotlib.pyplot as plt
import japanize_matplotlib  # pip install japanize-matplotlibが必要
import re
plt.rcParams&#91;'font.size'] = 14
# 家計調査データの読み込み
beg_col = 5  # 品目の始まる列
end_col = 6  # 品目の終わる列
usecols = &#91;3] + list(range(beg_col, end_col + 1))  # 用いる列リスト
# 年月の列をインデックスとするデータフレームに
df_estat = pd.read_csv('estat.csv', header=0, encoding='cp932',
    usecols=usecols, index_col=0, parse_dates=True, 
    date_parser=lambda x: pd.to_datetime(x, format='%Y年%m月'))
# 各品目の支出金額を各月の日数で割る（★）
for col in df_estat:
    new_col = re.sub(r'.* (.*)【円】', r'\1 (円/日)', col)
    df_estat&#91;new_col] = df_estat&#91;col] / df_estat.index.days_in_month
# 気象庁の気温データの読み込み
df_jma = pd.read_csv('jma.csv', skiprows=5, header=None, 
    encoding='cp932',
    usecols=&#91;0, 1], index_col=0, parse_dates=True)
hitemp_col = '平均最高気温 (℃)'  # 気温の列名を変える
df_jma.columns = &#91;hitemp_col]
# データフレームの結合
df = pd.merge(df_estat, df_jma, left_index=True, right_index=True)
df.index.name = 'y/m'  # インデックスの名前を変更
# 年と月の列を追加（後で利用）
df&#91;'year'] = df_estat.index.year
df&#91;'month'] = df_estat.index.month
print('df (merged):\n', df)
# 散布図をプロット
target_col = 'チョコレート菓子 (円/日)'
df.plot(kind='scatter', x=hitemp_col, y=target_col, figsize=(5, 5))
plt.show()

import pandas as pd

import matplotlib.pyplot as plt

import japanize_matplotlib # pip install japanize-matplotlibが必要

import re

plt.rcParams['font.size'] = 14

# 家計調査データの読み込み

beg_col = 5 # 品目の始まる列

end_col = 6 # 品目の終わる列

usecols = [3] + list(range(beg_col, end_col + 1)) # 用いる列リスト

# 年月の列をインデックスとするデータフレームに

df_estat = pd.read_csv('estat.csv', header=0, encoding='cp932',

usecols=usecols, index_col=0, parse_dates=True,

date_parser=lambda x: pd.to_datetime(x, format='%Y年%m月'))

# 各品目の支出金額を各月の日数で割る（★）

for col in df_estat:

new_col = re.sub(r'.* (.*)【円】', r'\1 (円/日)', col)

df_estat[new_col] = df_estat[col] / df_estat.index.days_in_month

# 気象庁の気温データの読み込み

df_jma = pd.read_csv('jma.csv', skiprows=5, header=None,

encoding='cp932',

usecols=[0, 1], index_col=0, parse_dates=True)

hitemp_col = '平均最高気温 (℃)' # 気温の列名を変える

df_jma.columns = [hitemp_col]

# データフレームの結合

df = pd.merge(df_estat, df_jma, left_index=True, right_index=True)

df.index.name = 'y/m' # インデックスの名前を変更

# 年と月の列を追加（後で利用）

df['year'] = df_estat.index.year

df['month'] = df_estat.index.month

print('df (merged):\n', df)

# 散布図をプロット

target_col = 'チョコレート菓子 (円/日)'

df.plot(kind='scatter', x=hitemp_col, y=target_col, figsize=(5, 5))

plt.show()

図7　チョコレート菓子支出金額の単回帰の結果を示すPythonコード

from sklearn.linear_model import LinearRegression

reg = LinearRegression()  # モデルを準備
# データを準備
X = df&#91;&#91;hitemp_col]].values  # 平均最高気温（2次元配列）。「.values」は省略可
y = df&#91;target_col].values  # 支出金額（1次元配列）。「.values」は省略可
reg.fit(X, y)  # 学習（データに直線を当てはめる）
print('intercept:', reg.intercept_)  # 切片
print('coef:', reg.coef_&#91;0])  # 直線の傾き
print('R2:', reg.score(X, y))  # 決定係数
df.plot(kind='scatter', x=hitemp_col, y=target_col, figsize=(5, 5))
plt.plot(X, reg.predict(X), 'r')
plt.show()

from sklearn.linear_model import LinearRegression

reg = LinearRegression() # モデルを準備

# データを準備

X = df[[hitemp_col]].values # 平均最高気温（2次元配列）。「.values」は省略可

y = df[target_col].values # 支出金額（1次元配列）。「.values」は省略可

reg.fit(X, y) # 学習（データに直線を当てはめる）

print('intercept:', reg.intercept_) # 切片

print('coef:', reg.coef_[0]) # 直線の傾き

print('R2:', reg.score(X, y)) # 決定係数

df.plot(kind='scatter', x=hitemp_col, y=target_col, figsize=(5, 5))

plt.plot(X, reg.predict(X), 'r')

plt.show()

図10　チョコレート菓子支出金額の単回帰における残差をプロットする
Pythonコード

fig = plt.figure(figsize=(5, 5))
plt.scatter(X, y - reg.predict(X))  # 残差をプロット
plt.axhline(y=0, c='r', linestyle='-')  # 水平線をプロット
plt.ylim(&#91;-3, 3])
plt.xlabel(hitemp_col)
plt.ylabel('残差')
plt.show()

fig = plt.figure(figsize=(5, 5))

plt.scatter(X, y - reg.predict(X)) # 残差をプロット

plt.axhline(y=0, c='r', linestyle='-') # 水平線をプロット

plt.ylim([-3, 3])

plt.xlabel(hitemp_col)

plt.ylabel('残差')

plt.show()

図13　年を説明変数とした単回帰のためのPythonコード

from matplotlib import cm
from matplotlib.colors import ListedColormap

def ListedGradation(cmapname, num=10):
    """ LinearSegmentedColormap から ListedColormap を作成 """
    color_list = &#91;]
    cmap = cm.get_cmap(cmapname)
    for i in range(num):
        color = list(cmap(i/num))
        color_list.append(color)
    return ListedColormap(color_list)
# 年ごとに散布図の色を変える
df.plot(kind='scatter', x=hitemp_col, y=target_col, c=df&#91;'year'],
    cmap=ListedGradation('cividis', 11), sharex=False, figsize=(6, 5))
plt.show()
# 「年」を説明変数とする単回帰
reg_year = LinearRegression()
X = df&#91;&#91;'year']].values  # 年
y = df&#91;target_col].values  # 支出金額
reg_year.fit(X, y)  # 学習（データに直線を当てはめる）
print('intercept:', reg_year.intercept_)  # 切片
print('coef:', reg_year.coef_&#91;0])  # 直線の傾き
print('R2:', reg_year.score(X, y))  # 決定係数
df.plot(kind='scatter', x='year', y=target_col, figsize=(5, 5))
plt.plot(X&#91;:, 0], reg_year.predict(X), 'r')
plt.show()

from matplotlib import cm

from matplotlib.colors import ListedColormap

def ListedGradation(cmapname, num=10):

""" LinearSegmentedColormap から ListedColormap を作成 """

color_list = []

cmap = cm.get_cmap(cmapname)

for i in range(num):

color = list(cmap(i/num))

color_list.append(color)

return ListedColormap(color_list)

# 年ごとに散布図の色を変える

df.plot(kind='scatter', x=hitemp_col, y=target_col, c=df['year'],

cmap=ListedGradation('cividis', 11), sharex=False, figsize=(6, 5))

plt.show()

# 「年」を説明変数とする単回帰

reg_year = LinearRegression()

X = df[['year']].values # 年

y = df[target_col].values # 支出金額

reg_year.fit(X, y) # 学習（データに直線を当てはめる）

print('intercept:', reg_year.intercept_) # 切片

print('coef:', reg_year.coef_[0]) # 直線の傾き

print('R2:', reg_year.score(X, y)) # 決定係数

df.plot(kind='scatter', x='year', y=target_col, figsize=(5, 5))

plt.plot(X[:, 0], reg_year.predict(X), 'r')

plt.show()

図15　平均最高気温と年の二つを説明変数とした重回帰のためのPythonコード

from mpl_toolkits.mplot3d import Axes3D
import numpy as np

reg_multi = LinearRegression()
X = df&#91;&#91;hitemp_col, 'year']].values  # 二つの説明変数（2次元配列）
y = df&#91;target_col].values  # 支出金額（1次元配列）
reg_multi.fit(X, y)  # 学習（データに平面を当てはめる）
print('intercept:', reg_multi.intercept_)  # 切片
print('coef:', reg_multi.coef_)  # 平面の傾き
print('R2:', reg_multi.score(X, y))  # 決定係数
# 3次元散布図のプロット
def scatter3d(X, y, xlabels, ylabel):
    fig = plt.figure(figsize=(9, 6))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(X&#91;:, 0], X&#91;:, 1], y)
    ax.set_xlabel(xlabels&#91;0])
    ax.set_ylabel(xlabels&#91;1])
    ax.set_zlabel(ylabel)
    ax.view_init(elev=30, azim=-60)
    return ax
scatter3d(X, y, &#91;hitemp_col, '年'], target_col)
plt.show()
# 平面（wireframe）を合わせてプロット
ax = scatter3d(X, y, &#91;hitemp_col, '年'], target_col)
xlim = ax.get_xlim()
ylim = ax.get_ylim()
mesh_x = np.meshgrid(np.arange(*xlim, (xlim&#91;1] - xlim&#91;0]) / 20),
    np.arange(*ylim, (ylim&#91;1] - ylim&#91;0]) / 20))
mesh_y = reg_multi.intercept_ + reg_multi.coef_&#91;0] * mesh_x&#91;0] \ 
    + reg_multi.coef_&#91;1] * mesh_x&#91;1]  # 回帰式より予測
ax.plot_wireframe(*mesh_x, mesh_y, color='r', linewidth=0.5)
plt.show()

from mpl_toolkits.mplot3d import Axes3D

import numpy as np

reg_multi = LinearRegression()

X = df[[hitemp_col, 'year']].values # 二つの説明変数（2次元配列）

y = df[target_col].values # 支出金額（1次元配列）

reg_multi.fit(X, y) # 学習（データに平面を当てはめる）

print('intercept:', reg_multi.intercept_) # 切片

print('coef:', reg_multi.coef_) # 平面の傾き

print('R2:', reg_multi.score(X, y)) # 決定係数

# 3次元散布図のプロット

def scatter3d(X, y, xlabels, ylabel):

fig = plt.figure(figsize=(9, 6))

ax = fig.add_subplot(111, projection='3d')

ax.scatter(X[:, 0], X[:, 1], y)

ax.set_xlabel(xlabels[0])

ax.set_ylabel(xlabels[1])

ax.set_zlabel(ylabel)

ax.view_init(elev=30, azim=-60)

return ax

scatter3d(X, y, [hitemp_col, '年'], target_col)

plt.show()

# 平面（wireframe）を合わせてプロット

ax = scatter3d(X, y, [hitemp_col, '年'], target_col)

xlim = ax.get_xlim()

ylim = ax.get_ylim()

mesh_x = np.meshgrid(np.arange(*xlim, (xlim[1] - xlim[0]) / 20),

np.arange(*ylim, (ylim[1] - ylim[0]) / 20))

mesh_y = reg_multi.intercept_ + reg_multi.coef_[0] * mesh_x[0] \

+ reg_multi.coef_[1] * mesh_x[1] # 回帰式より予測

ax.plot_wireframe(*mesh_x, mesh_y, color='r', linewidth=0.5)

plt.show()

図18　平均最高気温と年の二つを説明変数とした重回帰の残差をプロットするPythonコード

fig = plt.figure(figsize=(5, 5))
plt.scatter(X&#91;:, 0], y - reg_multi.predict(X)) # 残差をプロット
plt.axhline(y=0, c='r', linestyle='-') # 水平線をプロット
plt.ylim(&#91;-3, 3])plt.xlabel(hitemp_col)
plt.ylabel('残差')
plt.show()

fig = plt.figure(figsize=(5, 5))

plt.scatter(X[:, 0], y - reg_multi.predict(X)) # 残差をプロット

plt.axhline(y=0, c='r', linestyle='-') # 水平線をプロット

plt.ylim([-3, 3])plt.xlabel(hitemp_col)

plt.ylabel('残差')

plt.show()

図20　平均最高気温とアイスクリーム支出金額の散布図をプロットするPythonコード

target_col = 'アイスクリーム・シャーベット (円/日)'  # 目的変数を変える
df.plot(kind='scatter', x=hitemp_col, y=target_col, figsize=(5, 5))
plt.show()

target_col = 'アイスクリーム・シャーベット (円/日)' # 目的変数を変える

df.plot(kind='scatter', x=hitemp_col, y=target_col, figsize=(5, 5))

plt.show()

図22　多項式回帰モデルを使ったPythonコード

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge

def poly_fit(df, xlabel, ylabel, degree=2, ylim=None):
    """ 多項式回帰 """
    y = df&#91;ylabel]
    pf = PolynomialFeatures(degree=degree, include_bias=False)
    Xpf = pf.fit_transform(df&#91;&#91;xlabel]])
    reg_pf = LinearRegression(normalize=True)
    # reg_pf = Ridge(normalize=True, alpha=0.01)  # （★）
    reg_pf.fit(Xpf, y)
    print('intercept:', reg_pf.intercept_)  # 切片
    print('coef:', reg_pf.coef_)  # 直線の傾き
    print('R2:', reg_pf.score(Xpf, y))  # 決定係数
    fig = plt.figure(figsize=(5, 5))
    plt.plot(X, y, '.', ms=8)
    plt.ylim(ylim)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    # x軸に等間隔に点を配置して回帰曲線を描く
    xlim = plt.xlim()
    X_lin = np.arange(*xlim, (xlim&#91;1] - xlim&#91;0])/50).reshape(-1, 1)
    Xpf_lin = pf.fit_transform(X_lin)
    ypf_pred = reg_pf.predict(Xpf_lin)
    plt.plot(X_lin, ypf_pred, color='r')
X = df&#91;&#91;hitemp_col]].values  # 平均最高気温
y = df&#91;target_col].values  # 支出金額
for p in &#91;1, 2, 3, 20]:  # 多項式の次数
    poly_fit(df, hitemp_col, target_col, degree=p)
    plt.title(f'p = {p}')
    plt.show()

from sklearn.preprocessing import PolynomialFeatures

from sklearn.linear_model import Ridge

def poly_fit(df, xlabel, ylabel, degree=2, ylim=None):

""" 多項式回帰 """

y = df[ylabel]

pf = PolynomialFeatures(degree=degree, include_bias=False)

Xpf = pf.fit_transform(df[[xlabel]])

reg_pf = LinearRegression(normalize=True)

# reg_pf = Ridge(normalize=True, alpha=0.01) # （★）

reg_pf.fit(Xpf, y)

print('intercept:', reg_pf.intercept_) # 切片

print('coef:', reg_pf.coef_) # 直線の傾き

print('R2:', reg_pf.score(Xpf, y)) # 決定係数

fig = plt.figure(figsize=(5, 5))

plt.plot(X, y, '.', ms=8)

plt.ylim(ylim)

plt.xlabel(xlabel)

plt.ylabel(ylabel)

# x軸に等間隔に点を配置して回帰曲線を描く

xlim = plt.xlim()

X_lin = np.arange(*xlim, (xlim[1] - xlim[0])/50).reshape(-1, 1)

Xpf_lin = pf.fit_transform(X_lin)

ypf_pred = reg_pf.predict(Xpf_lin)

plt.plot(X_lin, ypf_pred, color='r')

X = df[[hitemp_col]].values # 平均最高気温

y = df[target_col].values # 支出金額

for p in [1, 2, 3, 20]: # 多項式の次数

poly_fit(df, hitemp_col, target_col, degree=p)

plt.title(f'p = {p}')

plt.show()