バーティカルバーの極意（Vol.69掲載）

著者：飯尾淳

Twitterのトレンドを分析に関する解説の最終回です。これまで、TwitterのトレンドAPIを叩いてトレンドを集め、Twitter Standard search APIでトレンドに関するツイートを収集、そのデータに基づいてトレンドの構造を分析する手法を紹介しました。今回は、その日の主要なトピックは何だったかを可視化する方法を解説します。

シェルスクリプトマガジン Vol.69は以下のリンク先でご購入できます。

図7　JSONデータを得るスクリプト（get_data.py）

#!/usr/bin/env python

import json
import urllib.request
import sys
import re

# replace urlbase to the base address of your application
# this script should be called as $ ./get_data.py 2019-02-01
# which means the argument should be a date formatted in YYYY-MM-DD
urlbase = 'http://iiojun.xyz/twt'
jsonapi = '/api/' + sys.argv[1]

word_dict = {}
words = [] 

req = urllib.request.Request(urlbase+jsonapi)
with urllib.request.urlopen(req) as res:
  content = json.loads(res.read().decode('utf8'))
  # loop for labels
  for item in content:
    label = item['label']
    wid = item['id']
    url2 = "{0}/api/trends/{1}".format(urlbase, wid)
    req2 = urllib.request.Request(url2)
    dct = {}
    # loop for words of each label
    with urllib.request.urlopen(req2) as res2:
      content2 = json.loads(res2.read().decode('utf8'))
      for item2 in content2[0]:
        word = item2['word']
        freq = item2['freq']
        dct[word] = freq
        # keep the all words used in the nodes in the array 'words'
        if not word in words: words.append(word)
      # keep the ary of {word, freq} in the dictionary 'word_dict'
      word_dict[label] = dct
print("label", end="")
for item in words:
  print("\t{0}".format(item), end="")
print()

for key in word_dict:
  print(key, end="")
  dct = word_dict[key]
  for item in words:
    freq = dct[item] if item in dct else 0.0
    print("\t%6.3f" % freq, end="")
  print()

図9　コサイン類似度を計算するスクリプト（calc_cos_sim.py）

#!/usr/bin/env python3

import numpy as np
import sys

def cos_sim(v1, v2):
  return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

lines = sys.stdin.readlines()

lines.pop(0)
wdic = {}
visited = []

for line in lines:
  words = line.split('\t')
  label = words.pop(0)
  wdic[label] = list(map(lambda x: float(x),words))

for key1 in wdic:
  for key2 in wdic:
    visited.append(key2+key1)
    if key1 == key2 or key1+key2 in visited: continue
    print("{0}\t{1}\t{2:9.6f}"
       .format(key1,key2,cos_sim(wdic[key1],wdic[key2])))

図11　dotスクリプトを作成するスクリプト（mk_net.rb）

#!/usr/bin/ruby

word_ary = []
freq_ary = []
node_set = []
node_color = {}
color_tbl = %w(khaki lemonchiffon aliceblue mistyrose coral
  goldenrod aquamarine lavender palegreen gold lightpink 
  plum yellow lightcyan lavenderblush gainsboro 
  yellowgreen lightsteelblue palegoldenrod lightskyblue 
  greenyellow plum cornflowerblue)
th_val = 0.75
mthd = 'fdp'
title = 'topicmap'

STDIN.each {|line|
  (kw1,kw2,freq) = line.split(/\t/)
  word_ary.push(kw1) unless word_ary.include?(kw1)
  word_ary.push(kw2) unless word_ary.include?(kw2)
  if freq.to_f > th_val
    freq_ary.push(line)
    flag = false
    node_set.each {|x|
      if x.include?(kw1) && x.include?(kw2)
        flag = true
        break
      end
      len0 = x.length
      x.push(kw2) if x.include?(kw1) && !x.include?(kw2)
      x.push(kw1) if !x.include?(kw1) && x.include?(kw2)
      if len0 < x.length
        flag = true
        break
      end
    }
    node_set.push([kw1, kw2]) unless flag
  end 
}

def get_set(ary_of_ary, x)
  ret_ary = []
  ary_of_ary.each {|ary|
    ret_ary = ret_ary + ary if ary.include?(x)
  }
  return ret_ary
end
def delete_set(ary_of_ary, x)
  ary_of_ary.each {|ary|
    ary_of_ary.delete(ary) if ary.include?(x)
  }
end

freq_ary.each {|x|
  (kw1,kw2,freq) = x.split(/\t/)
  x1 = get_set(node_set, kw1)
  x2 = get_set(node_set, kw2)
  next if (x1 == x2)
  x3 = x1 | x2
  delete_set(node_set, kw1)
  delete_set(node_set, kw2)
  node_set.push(x3)
}

word_ary.map {|x| node_color[x] = 'white' }

node_set.each_with_index {|value,index|
  i = index % color_tbl.length
  value.map{|x| node_color[x] = color_tbl[i] }
}

print "graph \"#{title}\" {\n"
print " graph [\n    layout = #{mthd}\n  ];\n"

word_ary.each {|x|
  printf "  \"%s\" [ fontname = \"ヒラギノ丸ゴ\"; style = \"filled\"; fillcolor = \"%s\"; fontcolor = \"%s\" ];\n", x, node_color[x], 'black'
}

while (!freq_ary.empty?) do
  (f,t,prob) = freq_ary.shift.split(/\t/)
  printf "  \"%s\" -- \"%s\" [label = \"%4.2f\"];\n", f, t, prob
end

print "}\n"