香川大学SLPからお届け！（Vol.79掲載）

著者：岩本和真

人間の言語をコンピュータで処理する自然言語処理の分野は、近年、急速に進歩しています。それによって例えば、自然な翻訳や文章生成をする技術などが開発されています。同分野の学習成果として筆者は、「秘書チャット」と名付けたチャットボットアプリを作成しました。今回は、このアプリに盛り込んだ機能について、技術的に解説します。

シェルスクリプトマガジン Vol.79は以下のリンク先でご購入いただけます。

図5　形態素解析処理をするmorpheme()関数の定義コード

def morpheme(self, input, speech=False):
  """Split text into tokens, optionally with a label for each token.

  The text is NFKC-normalized before it is handed to ``self.wakati``.

  Args:
    input: Text to analyze.
    speech: When true, also collect the first comma-separated feature
      field of every node except the BOS/EOS markers.

  Returns:
    The whitespace-split output of ``self.wakati.parse`` as a list. When
    ``speech`` is true, a ``(tokens, speech_list)`` tuple instead.
  """
  input = unicodedata.normalize('NFKC', input)
  # Both modes need the token list, so build it once up front.
  sentence = self.wakati.parse(input).split()
  if not speech:
    return sentence

  speech_list = []
  node = self.wakati.parseToNode(input)
  while node:
    # NOTE(review): the first feature field is presumably the part of
    # speech (callers compare it with "名詞") — confirm against the tagger.
    head = node.feature.split(",")[0]
    if head != "BOS/EOS":
      speech_list.append(head)
    node = node.next
  return sentence, speech_list

def morpheme(self, input, speech=False):
  """Split text into tokens, optionally with a label for each token.

  The text is NFKC-normalized before it is handed to ``self.wakati``.

  Args:
    input: Text to analyze.
    speech: When true, also collect the first comma-separated feature
      field of every node except the BOS/EOS markers.

  Returns:
    The whitespace-split output of ``self.wakati.parse`` as a list. When
    ``speech`` is true, a ``(tokens, speech_list)`` tuple instead.
  """
  input = unicodedata.normalize('NFKC', input)
  # Both modes need the token list, so build it once up front.
  sentence = self.wakati.parse(input).split()
  if not speech:
    return sentence

  speech_list = []
  node = self.wakati.parseToNode(input)
  while node:
    # NOTE(review): the first feature field is presumably the part of
    # speech (callers compare it with "名詞") — confirm against the tagger.
    head = node.feature.split(",")[0]
    if head != "BOS/EOS":
      speech_list.append(head)
    node = node.next
  return sentence, speech_list

図8　予定の内容を抽出するcontent_extract()関数の定義コード

def content_extract(self, input_list, speech_list):
  """Build the schedule-content string from a tokenized sentence.

  Tokens are scanned left to right. Scanning stops at the first ban word.
  Pass words are skipped, and so are a decimal token followed by an entry
  of ``self.date_key`` and a ``self.date_key`` entry preceded by a decimal
  token. Nouns are kept, as is any token that sits between two nouns once
  at least one token has already been kept.

  Args:
    input_list: Tokens of the sentence.
    speech_list: Part-of-speech label for each token; assumed to be
      parallel to ``input_list``.

  Returns:
    The kept tokens concatenated without separators.
  """
  ban_word = ("覚え", "記憶")
  pass_word = ("予定", "こと")
  out_list = []
  for i, word in enumerate(input_list):
    if word in ban_word:
      break
    if word in pass_word:
      continue

    # Only look at neighbors that exist: indexing i+1 past the end raised
    # IndexError, and i-1 at the start wrapped around to the last token.
    has_prev = i > 0
    has_next = i + 1 < len(input_list)

    if word.isdecimal() and has_next and input_list[i+1] in self.date_key:
      continue
    if has_prev and input_list[i-1].isdecimal() and word in self.date_key:
      continue

    # A token between two nouns (e.g. a particle) is kept so the extracted
    # phrase stays connected, unless the following word is a pass word.
    joins_nouns = (
      bool(out_list)
      and has_prev
      and has_next
      and i + 1 < len(speech_list)
      and speech_list[i-1] == "名詞"
      and speech_list[i+1] == "名詞"
      and input_list[i+1] not in pass_word
    )
    if joins_nouns or speech_list[i] == "名詞":
      out_list.append(word)
  return ''.join(out_list)

def content_extract(self, input_list, speech_list):
  """Build the schedule-content string from a tokenized sentence.

  Tokens are scanned left to right. Scanning stops at the first ban word.
  Pass words are skipped, and so are a decimal token followed by an entry
  of ``self.date_key`` and a ``self.date_key`` entry preceded by a decimal
  token. Nouns are kept, as is any token that sits between two nouns once
  at least one token has already been kept.

  Args:
    input_list: Tokens of the sentence.
    speech_list: Part-of-speech label for each token; assumed to be
      parallel to ``input_list``.

  Returns:
    The kept tokens concatenated without separators.
  """
  ban_word = ("覚え", "記憶")
  pass_word = ("予定", "こと")
  out_list = []
  for i, word in enumerate(input_list):
    if word in ban_word:
      break
    if word in pass_word:
      continue

    # Only look at neighbors that exist: indexing i+1 past the end raised
    # IndexError, and i-1 at the start wrapped around to the last token.
    has_prev = i > 0
    has_next = i + 1 < len(input_list)

    if word.isdecimal() and has_next and input_list[i+1] in self.date_key:
      continue
    if has_prev and input_list[i-1].isdecimal() and word in self.date_key:
      continue

    # A token between two nouns (e.g. a particle) is kept so the extracted
    # phrase stays connected, unless the following word is a pass word.
    joins_nouns = (
      bool(out_list)
      and has_prev
      and has_next
      and i + 1 < len(speech_list)
      and speech_list[i-1] == "名詞"
      and speech_list[i+1] == "名詞"
      and input_list[i+1] not in pass_word
    )
    if joins_nouns or speech_list[i] == "名詞":
      out_list.append(word)
  return ''.join(out_list)

図9　Webページのテキスト情報を抽出するscraping()関数の定義コード

from bs4 import BeautifulSoup
import requests

def scraping(url, file_path):
  """Download a web page and save its text with duplicate lines removed.

  Args:
    url: Address of the page to fetch.
    file_path: Destination text file; overwritten and written as UTF-8.

  Raises:
    requests.exceptions.RequestException: If the request fails or the
      server does not respond within the timeout.
  """
  # Without a timeout, requests can wait indefinitely on a silent server.
  responses = requests.get(url, timeout=30)
  soup = BeautifulSoup(responses.content, 'html.parser')
  # Remove ideographic spaces before de-duplicating so that lines which
  # differ only in that padding collapse into a single entry.
  lines = [line.replace('\u3000', '') for line in soup.get_text().splitlines()]
  # dict.fromkeys drops duplicates while keeping first-seen order; going
  # through a set shuffled the lines differently on every run.
  text = '\n'.join(dict.fromkeys(lines))
  with open(file_path, 'w', encoding='utf_8') as f:
    f.write(text)

from bs4 import BeautifulSoup

import requests

def scraping(url, file_path):
  """Download a web page and save its text with duplicate lines removed.

  Args:
    url: Address of the page to fetch.
    file_path: Destination text file; overwritten and written as UTF-8.

  Raises:
    requests.exceptions.RequestException: If the request fails or the
      server does not respond within the timeout.
  """
  # Without a timeout, requests can wait indefinitely on a silent server.
  responses = requests.get(url, timeout=30)
  soup = BeautifulSoup(responses.content, 'html.parser')
  # Remove ideographic spaces before de-duplicating so that lines which
  # differ only in that padding collapse into a single entry.
  lines = [line.replace('\u3000', '') for line in soup.get_text().splitlines()]
  # dict.fromkeys drops duplicates while keeping first-seen order; going
  # through a set shuffled the lines differently on every run.
  text = '\n'.join(dict.fromkeys(lines))
  with open(file_path, 'w', encoding='utf_8') as f:
    f.write(text)

図12　曜日や豆知識を答える機能のコード

def week_teach(self, input):
  """Work out the date mentioned in the text and its weekday label.

  The text is passed through ``self.date_update.convert`` and then
  tokenized with ``self.morpheme`` before the date parts are looked up.

  Args:
    input: Text that mentions a date.

  Returns:
    A ``(year, month, day, weekday_label)`` tuple. The label is the entry
    of ``self.week_list`` at the index given by ``date.weekday()``.
  """
  tokens = self.morpheme(self.date_update.convert(input))
  # Year and month fall back to the stored values when the text omits them.
  year = self.date_specify("年", tokens) if "年" in tokens else self.year
  month = self.date_specify("月", tokens) if "月" in tokens else self.month
  # The day has no fallback: it is always resolved from the tokens.
  day = self.date_specify("日", tokens)
  weekday_index = dt.date(year, month, day).weekday()
  return year, month, day, self.week_list[weekday_index]
    
def knowledge_teach(self):
  """Return one randomly chosen line from the trivia text file.

  The line is returned exactly as read from ``text_data/min_kl.txt``,
  including any trailing newline.
  """
  with open("text_data/min_kl.txt", 'r', encoding='UTF-8') as source:
    candidates = list(source)
  return random.choice(candidates)

def week_teach(self, input):
  """Work out the date mentioned in the text and its weekday label.

  The text is passed through ``self.date_update.convert`` and then
  tokenized with ``self.morpheme`` before the date parts are looked up.

  Args:
    input: Text that mentions a date.

  Returns:
    A ``(year, month, day, weekday_label)`` tuple. The label is the entry
    of ``self.week_list`` at the index given by ``date.weekday()``.
  """
  tokens = self.morpheme(self.date_update.convert(input))
  # Year and month fall back to the stored values when the text omits them.
  year = self.date_specify("年", tokens) if "年" in tokens else self.year
  month = self.date_specify("月", tokens) if "月" in tokens else self.month
  # The day has no fallback: it is always resolved from the tokens.
  day = self.date_specify("日", tokens)
  weekday_index = dt.date(year, month, day).weekday()
  return year, month, day, self.week_list[weekday_index]

def knowledge_teach(self):
  """Return one randomly chosen line from the trivia text file.

  The line is returned exactly as read from ``text_data/min_kl.txt``,
  including any trailing newline.
  """
  with open("text_data/min_kl.txt", 'r', encoding='UTF-8') as source:
    candidates = list(source)
  return random.choice(candidates)