Pythonあれこれ（Vol.95掲載）

著者：飯尾淳

本連載では「Pythonを昔から使っているものの、それほど使いこなしてはいない」という筆者が、いろいろな日常業務をPythonで処理することで、立派な「蛇使い」に育つことを目指します。その過程を温かく見守ってください。皆さんと共に勉強していきましょう。第25回では、人間がどのように文章の意味を理解するかという認知能力に関わる、少し不思議で面白い現象について取り上げます。

シェルスクリプトマガジン Vol.95は以下のリンク先でご購入いただけます。

図4　タイポグリセミア度を算出する関数tl()の定義コード

import re

# タイポグリセミアに関係する部分を取り出して文字のリストを返す
def s2cl(s):
  """Return the interior characters of every word longer than three characters.

  The punctuation matched by the pattern below is deleted from the stripped
  input and the remainder is split on whitespace.  For each resulting word
  of four or more characters the first and last characters are dropped; the
  characters that remain are returned, in order, as one flat list.
  """
  punctuation = '[,.;:．，。、?？!！]'
  words = re.sub(punctuation, '', s.strip()).split()
  inner_chars = []
  for word in words:
    # Words of three characters or fewer contribute nothing.
    if len(word) > 3:
      inner_chars.extend(word[1:-1])
  return inner_chars

# タイポグリセミア度を計算する関数
def tl(s1, s2):
  """Return the typoglycemia level of `s2` measured against `s1`.

  Both strings are passed through s2cl() and the resulting character lists
  are compared position by position.  The level is the number of positions
  that differ divided by the number of characters extracted from `s1`.

  Args:
    s1: the reference (original) sentence.
    s2: the sentence to compare with `s1`.

  Returns:
    A float between 0.0 and 1.0.  When `s1` yields no characters at all
    there is nothing that could have been shuffled, so 0.0 is returned
    instead of dividing by zero.

  Raises:
    IndexError: if `s2` yields fewer characters than `s1`.
  """
  s1_chars = s2cl(s1)
  s2_chars = s2cl(s2)
  if not s1_chars:
    return 0.0
  # Count the positions whose character was changed.  Indexing s2_chars
  # (rather than zip) keeps a too-short s2 an error instead of silently
  # truncating the comparison.
  ctr = sum(1 for i, ch in enumerate(s1_chars) if ch != s2_chars[i])
  return ctr / len(s1_chars)

import re

# タイポグリセミアに関係する部分を取り出して文字のリストを返す

def s2cl(s):
  """Return the interior characters of every word longer than three characters.

  The punctuation matched by the pattern below is deleted from the stripped
  input and the remainder is split on whitespace.  For each resulting word
  of four or more characters the first and last characters are dropped; the
  characters that remain are returned, in order, as one flat list.
  """
  punctuation = '[,.;:．，。、?？!！]'
  words = re.sub(punctuation, '', s.strip()).split()
  inner_chars = []
  for word in words:
    # Words of three characters or fewer contribute nothing.
    if len(word) > 3:
      inner_chars.extend(word[1:-1])
  return inner_chars

# タイポグリセミア度を計算する関数

def tl(s1, s2):
  """Return the typoglycemia level of `s2` measured against `s1`.

  Both strings are passed through s2cl() and the resulting character lists
  are compared position by position.  The level is the number of positions
  that differ divided by the number of characters extracted from `s1`.

  Args:
    s1: the reference (original) sentence.
    s2: the sentence to compare with `s1`.

  Returns:
    A float between 0.0 and 1.0.  When `s1` yields no characters at all
    there is nothing that could have been shuffled, so 0.0 is returned
    instead of dividing by zero.

  Raises:
    IndexError: if `s2` yields fewer characters than `s1`.
  """
  s1_chars = s2cl(s1)
  s2_chars = s2cl(s2)
  if not s1_chars:
    return 0.0
  # Count the positions whose character was changed.  Indexing s2_chars
  # (rather than zip) keeps a too-short s2 an error instead of silently
  # truncating the comparison.
  ctr = sum(1 for i, ch in enumerate(s1_chars) if ch != s2_chars[i])
  return ctr / len(s1_chars)

図5　関数make_candidate_index()の定義コード

def make_candidate_index(orig, debug=False):
  """Return, per word, the positions in `orig` whose characters may be shuffled.

  A position qualifies when it is neither a stop character nor directly
  next to one; the start and the end of the string count as stop
  characters.  In other words, only interior characters of a word qualify.
  Words with a single interior character are dropped because there is
  nothing to swap that character with.

  Args:
    orig: the text to analyse.
    debug: when true, print the original text and the two intermediate
      masks (prefixed 'ORG:', 'TMP:' and 'CDD:').

  Returns:
    A list of lists of ascending indexes into `orig`, one inner list per
    word that has at least two interior characters.  Empty when no word
    qualifies.
  """
  # Same character set as before (space, punctuation, quote, backslash,
  # '#', '*', '&', newline), written without invalid escape sequences.
  stopwords = ' ,.!?！？．。，、「」"\\#*&\n'
  length = len(orig)

  # Pass 1: blank out stop characters and their direct neighbours.  Padding
  # with a stop character on both sides makes the first and last character
  # of the text behave like word boundaries, so the final word is processed
  # like every other word instead of being cut off.
  padded = ' ' + orig + ' '
  tmp = ''.join(
    ' ' if (padded[i] in stopwords
            or padded[i + 1] in stopwords
            or padded[i + 2] in stopwords)
    else orig[i]
    for i in range(length)
  )

  # Pass 2: blank out characters left isolated by pass 1 (blank on both
  # sides), i.e. the lone interior character of a three-letter word.
  padded_tmp = ' ' + tmp + ' '
  candidate = ''.join(
    ' ' if (padded_tmp[i + 1] == ' '
            or (padded_tmp[i] == ' ' and padded_tmp[i + 2] == ' '))
    else tmp[i]
    for i in range(length)
  )

  # debug=True prints the masks, aligned with the original text.
  if debug:
    print('ORG:' + orig)
    print('TMP:' + tmp)
    print('CDD:' + candidate)

  # Group consecutive surviving positions into one list per word.  The last
  # position of `candidate` is always blank, so every run is closed inside
  # the loop and no flush is needed afterwards.
  indexes = []
  run = []
  for i, ch in enumerate(candidate):
    if ch != ' ':
      run.append(i)
    elif run:
      indexes.append(run)
      run = []
  return indexes

def make_candidate_index(orig, debug=False):
  """Return, per word, the positions in `orig` whose characters may be shuffled.

  A position qualifies when it is neither a stop character nor directly
  next to one; the start and the end of the string count as stop
  characters.  In other words, only interior characters of a word qualify.
  Words with a single interior character are dropped because there is
  nothing to swap that character with.

  Args:
    orig: the text to analyse.
    debug: when true, print the original text and the two intermediate
      masks (prefixed 'ORG:', 'TMP:' and 'CDD:').

  Returns:
    A list of lists of ascending indexes into `orig`, one inner list per
    word that has at least two interior characters.  Empty when no word
    qualifies.
  """
  # Same character set as before (space, punctuation, quote, backslash,
  # '#', '*', '&', newline), written without invalid escape sequences.
  stopwords = ' ,.!?！？．。，、「」"\\#*&\n'
  length = len(orig)

  # Pass 1: blank out stop characters and their direct neighbours.  Padding
  # with a stop character on both sides makes the first and last character
  # of the text behave like word boundaries, so the final word is processed
  # like every other word instead of being cut off.
  padded = ' ' + orig + ' '
  tmp = ''.join(
    ' ' if (padded[i] in stopwords
            or padded[i + 1] in stopwords
            or padded[i + 2] in stopwords)
    else orig[i]
    for i in range(length)
  )

  # Pass 2: blank out characters left isolated by pass 1 (blank on both
  # sides), i.e. the lone interior character of a three-letter word.
  padded_tmp = ' ' + tmp + ' '
  candidate = ''.join(
    ' ' if (padded_tmp[i + 1] == ' '
            or (padded_tmp[i] == ' ' and padded_tmp[i + 2] == ' '))
    else tmp[i]
    for i in range(length)
  )

  # debug=True prints the masks, aligned with the original text.
  if debug:
    print('ORG:' + orig)
    print('TMP:' + tmp)
    print('CDD:' + candidate)

  # Group consecutive surviving positions into one list per word.  The last
  # position of `candidate` is always blank, so every run is closed inside
  # the loop and no flush is needed afterwards.
  indexes = []
  run = []
  for i, ch in enumerate(candidate):
    if ch != ' ':
      run.append(i)
    elif run:
      indexes.append(run)
      run = []
  return indexes

図7　タイポグリセミア文を作成する関数typoglycemia()の定義コード

from functools import reduce
import random
　
def typoglycemia(orig, tlval, debug=False):
  """Return a copy of `orig` with interior word characters shuffled.

  Swappable positions come from make_candidate_index().  Words are picked at
  random and positions are scheduled for exchange until at least
  `tlval` times the total number of swappable positions have been scheduled.
  A word with four or more remaining positions gives up one random pair per
  pick; a word with two or three remaining positions is consumed whole.
  Pairs are then swapped and triples are rotated in a random direction.

  Args:
    orig: the text to scramble.
    tlval: requested share of swappable characters to move, 0.0 to 1.0.
    debug: when true, print the selection steps.

  Returns:
    The scrambled string.  `orig` is returned unchanged when it contains no
    swappable positions.  None is returned, after printing a message, when
    `tlval` is out of range or a word with fewer than two remaining
    positions is encountered.
  """
  if tlval < 0.0 or tlval > 1.0:
    print('tlvalは0.0以上1.0以下でなくてはいけません')
    return None
  # Start from the per-word lists of swappable positions.
  indexes = make_candidate_index(orig)
  # Total number of swappable positions.  sum() also copes with an empty
  # list, where reduce() without an initial value raises TypeError.
  denominator = sum(len(group) for group in indexes)
  # Position groups (pairs or triples) whose characters will be exchanged.
  procs = []
  # Number of positions scheduled so far.
  swap_nums = 0
  while swap_nums < tlval * denominator:
    # Choose the word to work on.
    target_idx = random.randint(0, len(indexes) - 1)
    target_list = indexes[target_idx]
    if debug: print('---\n処理の対象: ' + str(target_list))
    if len(target_list) >= 4:
      # Long enough: draw two distinct positions and take them out of the
      # word so that they cannot be chosen again.
      char_idx = 0
      swap_idx = 0
      while char_idx == swap_idx:
        char_idx = random.randint(0, len(target_list) - 1)
        swap_idx = random.randint(0, len(target_list) - 1)
      c1 = target_list[char_idx]
      c2 = target_list[swap_idx]
      target_list.remove(c1)
      target_list.remove(c2)
      procs.append([c1, c2])
      swap_nums += 2
    elif len(target_list) == 2 or len(target_list) == 3:
      # Two or three positions left: schedule all of them and retire the word.
      procs.append(target_list)
      indexes.remove(target_list)
      swap_nums += len(target_list)
      target_list = []  # only affects the debug print below
    else:
      print('エラー')
      return None
    if debug:
      print('処理リスト: ' + str(procs))
      print('処理後対象: ' + str(target_list))
  if debug: print('---\n残り: ' + str(indexes) + '\n')
  # Apply the scheduled exchanges.  Every position occurs in at most one
  # group, so reading the source characters from `orig` is safe.
  chars = list(orig)
  for positions in procs:
    if len(positions) == 2:
      a, b = positions
      chars[a], chars[b] = orig[b], orig[a]
    else:
      a, b, c = positions
      if random.randint(0, 1) == 0:
        # Rotate the three characters one way ...
        chars[a], chars[b], chars[c] = orig[b], orig[c], orig[a]
      else:
        # ... or the other way.
        chars[a], chars[b], chars[c] = orig[c], orig[a], orig[b]
  return ''.join(chars)

from functools import reduce

import random

def typoglycemia(orig, tlval, debug=False):
  """Return a copy of `orig` with interior word characters shuffled.

  Swappable positions come from make_candidate_index().  Words are picked at
  random and positions are scheduled for exchange until at least
  `tlval` times the total number of swappable positions have been scheduled.
  A word with four or more remaining positions gives up one random pair per
  pick; a word with two or three remaining positions is consumed whole.
  Pairs are then swapped and triples are rotated in a random direction.

  Args:
    orig: the text to scramble.
    tlval: requested share of swappable characters to move, 0.0 to 1.0.
    debug: when true, print the selection steps.

  Returns:
    The scrambled string.  `orig` is returned unchanged when it contains no
    swappable positions.  None is returned, after printing a message, when
    `tlval` is out of range or a word with fewer than two remaining
    positions is encountered.
  """
  if tlval < 0.0 or tlval > 1.0:
    print('tlvalは0.0以上1.0以下でなくてはいけません')
    return None
  # Start from the per-word lists of swappable positions.
  indexes = make_candidate_index(orig)
  # Total number of swappable positions.  sum() also copes with an empty
  # list, where reduce() without an initial value raises TypeError.
  denominator = sum(len(group) for group in indexes)
  # Position groups (pairs or triples) whose characters will be exchanged.
  procs = []
  # Number of positions scheduled so far.
  swap_nums = 0
  while swap_nums < tlval * denominator:
    # Choose the word to work on.
    target_idx = random.randint(0, len(indexes) - 1)
    target_list = indexes[target_idx]
    if debug: print('---\n処理の対象: ' + str(target_list))
    if len(target_list) >= 4:
      # Long enough: draw two distinct positions and take them out of the
      # word so that they cannot be chosen again.
      char_idx = 0
      swap_idx = 0
      while char_idx == swap_idx:
        char_idx = random.randint(0, len(target_list) - 1)
        swap_idx = random.randint(0, len(target_list) - 1)
      c1 = target_list[char_idx]
      c2 = target_list[swap_idx]
      target_list.remove(c1)
      target_list.remove(c2)
      procs.append([c1, c2])
      swap_nums += 2
    elif len(target_list) == 2 or len(target_list) == 3:
      # Two or three positions left: schedule all of them and retire the word.
      procs.append(target_list)
      indexes.remove(target_list)
      swap_nums += len(target_list)
      target_list = []  # only affects the debug print below
    else:
      print('エラー')
      return None
    if debug:
      print('処理リスト: ' + str(procs))
      print('処理後対象: ' + str(target_list))
  if debug: print('---\n残り: ' + str(indexes) + '\n')
  # Apply the scheduled exchanges.  Every position occurs in at most one
  # group, so reading the source characters from `orig` is safe.
  chars = list(orig)
  for positions in procs:
    if len(positions) == 2:
      a, b = positions
      chars[a], chars[b] = orig[b], orig[a]
    else:
      a, b, c = positions
      if random.randint(0, 1) == 0:
        # Rotate the three characters one way ...
        chars[a], chars[b], chars[c] = orig[b], orig[c], orig[a]
      else:
        # ... or the other way.
        chars[a], chars[b], chars[c] = orig[c], orig[a], orig[b]
  return ''.join(chars)

図8　タイポグリセミア度「0.7」の文を作成した例

# Sample sentence to scramble.
org = 'The superpowered quick brown fox jumps over the beautiful lazy dog.'
# Request a typoglycemia level of 0.7; debug=True prints the selection steps.
typo = typoglycemia(org, 0.7, debug=True)
# Bare expression: presumably meant to display the result in a notebook/REPL
# cell; run as a plain script this line shows nothing.
typo

# Sample sentence to scramble.
org = 'The superpowered quick brown fox jumps over the beautiful lazy dog.'

# Request a typoglycemia level of 0.7; debug=True prints the selection steps.
typo = typoglycemia(org, 0.7, debug=True)

# Bare expression: presumably meant to display the result in a notebook/REPL
# cell; run as a plain script this line shows nothing.
typo