バーティカルバーの極意（Vol.69掲載）

著者：飯尾淳

Twitterのトレンド分析に関する解説の最終回です。これまで、TwitterのトレンドAPIを叩いてトレンドを集め、Twitter Standard search APIでトレンドに関するツイートを収集し、そのデータに基づいてトレンドの構造を分析する手法を紹介しました。今回は、その日の主要なトピックは何だったかを可視化する方法を解説します。

シェルスクリプトマガジン Vol.69は以下のリンク先でご購入いただけます。

図7　JSONデータを得るスクリプト（get_data.py）

#!/usr/bin/env python3

import json
import urllib.request
import sys
import re

# Fetch the labels of one day from the JSON API, then the word
# frequencies of each label, and print them on standard output as a
# tab-separated table: one row per label, one column per word.
#
# Replace urlbase with the base address of your application.
# Call this script as:  $ ./get_data.py 2019-02-01
# i.e. the only argument is a date formatted as YYYY-MM-DD.
urlbase = 'http://iiojun.xyz/twt'

# Stop with a usage message instead of an IndexError traceback when
# the date argument is missing.
if len(sys.argv) < 2:
  sys.exit("usage: {0} YYYY-MM-DD".format(sys.argv[0]))
jsonapi = '/api/' + sys.argv[1]

word_dict = {}   # label -> {word: freq}
words = []       # every word seen, in first-appearance order (column order)

req = urllib.request.Request(urlbase+jsonapi)
with urllib.request.urlopen(req) as res:
  content = json.loads(res.read().decode('utf8'))
  # loop over the labels
  for item in content:
    label = item['label']
    wid = item['id']
    url2 = "{0}/api/trends/{1}".format(urlbase, wid)
    req2 = urllib.request.Request(url2)
    dct = {}
    # loop over the words of each label
    with urllib.request.urlopen(req2) as res2:
      content2 = json.loads(res2.read().decode('utf8'))
      for item2 in content2[0]:
        word = item2['word']
        freq = item2['freq']
        dct[word] = freq
        # remember every word used by any label, without duplicates
        if word not in words: words.append(word)
      # keep the {word: freq} mapping of this label in 'word_dict'
      word_dict[label] = dct

# header row: "label" followed by one column name per word
print("label", end="")
for item in words:
  print("\t{0}".format(item), end="")
print()

# one row per label; a word the label does not use is printed as 0.0
for key in word_dict:
  print(key, end="")
  dct = word_dict[key]
  for item in words:
    freq = dct.get(item, 0.0)
    print("\t%6.3f" % freq, end="")
  print()

#!/usr/bin/env python3

import json
import urllib.request
import sys
import re

# Fetch the labels of one day from the JSON API, then the word
# frequencies of each label, and print them on standard output as a
# tab-separated table: one row per label, one column per word.
#
# Replace urlbase with the base address of your application.
# Call this script as:  $ ./get_data.py 2019-02-01
# i.e. the only argument is a date formatted as YYYY-MM-DD.
urlbase = 'http://iiojun.xyz/twt'

# Stop with a usage message instead of an IndexError traceback when
# the date argument is missing.
if len(sys.argv) < 2:
  sys.exit("usage: {0} YYYY-MM-DD".format(sys.argv[0]))
jsonapi = '/api/' + sys.argv[1]

word_dict = {}   # label -> {word: freq}
words = []       # every word seen, in first-appearance order (column order)

req = urllib.request.Request(urlbase+jsonapi)
with urllib.request.urlopen(req) as res:
  content = json.loads(res.read().decode('utf8'))
  # loop over the labels
  for item in content:
    label = item['label']
    wid = item['id']
    url2 = "{0}/api/trends/{1}".format(urlbase, wid)
    req2 = urllib.request.Request(url2)
    dct = {}
    # loop over the words of each label
    with urllib.request.urlopen(req2) as res2:
      content2 = json.loads(res2.read().decode('utf8'))
      for item2 in content2[0]:
        word = item2['word']
        freq = item2['freq']
        dct[word] = freq
        # remember every word used by any label, without duplicates
        if word not in words: words.append(word)
      # keep the {word: freq} mapping of this label in 'word_dict'
      word_dict[label] = dct

# header row: "label" followed by one column name per word
print("label", end="")
for item in words:
  print("\t{0}".format(item), end="")
print()

# one row per label; a word the label does not use is printed as 0.0
for key in word_dict:
  print(key, end="")
  dct = word_dict[key]
  for item in words:
    freq = dct.get(item, 0.0)
    print("\t%6.3f" % freq, end="")
  print()

図9　コサイン類似度を計算するスクリプト（calc_cos_sim.py）

#!/usr/bin/env python3

import numpy as np
import sys

def cos_sim(v1, v2):
  """Return the cosine similarity of two equal-length numeric vectors.

  v1, v2: sequences of numbers (lists or NumPy arrays) of the same length.
  Returns dot(v1, v2) / (|v1| * |v2|).  When either vector has zero
  norm the quotient is undefined; 0.0 is returned in that case instead
  of dividing by zero (which would yield NaN and a RuntimeWarning).
  """
  denom = np.linalg.norm(v1) * np.linalg.norm(v2)
  if denom == 0:
    return 0.0
  return np.dot(v1, v2) / denom

# Read the tab-separated table from standard input and drop its header
# row; slicing (rather than pop) also copes with completely empty input.
lines = sys.stdin.readlines()[1:]

wdic = {}   # label -> list of float frequencies (one per word column)

for line in lines:
  # skip blank lines so they do not become a label with an empty vector
  if not line.strip(): continue
  words = line.split('\t')
  label = words.pop(0)
  wdic[label] = [float(x) for x in words]

# Print every unordered pair of distinct labels exactly once, in input
# order (the earlier label first).  Walking the label list by position
# avoids identifying a pair by its concatenated label strings, which is
# ambiguous ("ab"+"c" equals "a"+"bc").
labels = list(wdic)
for i, key1 in enumerate(labels):
  for key2 in labels[i+1:]:
    print("{0}\t{1}\t{2:9.6f}"
       .format(key1,key2,cos_sim(wdic[key1],wdic[key2])))

#!/usr/bin/env python3

import numpy as np

import sys

def cos_sim(v1, v2):
  """Return the cosine similarity of two equal-length numeric vectors.

  v1, v2: sequences of numbers (lists or NumPy arrays) of the same length.
  Returns dot(v1, v2) / (|v1| * |v2|).  When either vector has zero
  norm the quotient is undefined; 0.0 is returned in that case instead
  of dividing by zero (which would yield NaN and a RuntimeWarning).
  """
  denom = np.linalg.norm(v1) * np.linalg.norm(v2)
  if denom == 0:
    return 0.0
  return np.dot(v1, v2) / denom

# Read the tab-separated table from standard input and drop its header
# row; slicing (rather than pop) also copes with completely empty input.
lines = sys.stdin.readlines()[1:]

wdic = {}   # label -> list of float frequencies (one per word column)

for line in lines:
  # skip blank lines so they do not become a label with an empty vector
  if not line.strip(): continue
  words = line.split('\t')
  label = words.pop(0)
  wdic[label] = [float(x) for x in words]

# Print every unordered pair of distinct labels exactly once, in input
# order (the earlier label first).  Walking the label list by position
# avoids identifying a pair by its concatenated label strings, which is
# ambiguous ("ab"+"c" equals "a"+"bc").
labels = list(wdic)
for i, key1 in enumerate(labels):
  for key2 in labels[i+1:]:
    print("{0}\t{1}\t{2:9.6f}"
       .format(key1,key2,cos_sim(wdic[key1],wdic[key2])))

図11　dotスクリプトを作成するスクリプト（mk_net.rb）

#!/usr/bin/ruby

# State shared by the whole script.
word_ary   = []  # every keyword read from stdin, in first-appearance order
freq_ary   = []  # raw input lines whose score is above th_val
node_set   = []  # groups (arrays) of keywords joined by above-threshold lines
node_color = {}  # keyword => colour name, filled in after grouping

# Colour names cycled over the keyword groups.
# NOTE(review): 'plum' appears twice in this list, so two different
# groups can end up with the same colour -- confirm whether intended.
color_tbl = %w[
  khaki lemonchiffon aliceblue mistyrose coral
  goldenrod aquamarine lavender palegreen gold lightpink
  plum yellow lightcyan lavenderblush gainsboro
  yellowgreen lightsteelblue palegoldenrod lightskyblue
  greenyellow plum cornflowerblue
]
th_val = 0.75       # only lines scoring strictly above this are kept
mthd   = 'fdp'      # value written as "layout" in the graph header
title  = 'topicmap' # name of the generated graph

# First pass over stdin: each line is "kw1<TAB>kw2<TAB>score".
STDIN.each_line do |line|
  kw1, kw2, freq = line.split(/\t/)
  [kw1, kw2].each { |kw| word_ary << kw unless word_ary.include?(kw) }
  next unless freq.to_f > th_val

  freq_ary << line
  # Attach the pair to the first group that already holds either
  # keyword; only when no such group exists does the pair start a new one.
  group = node_set.find { |members| members.include?(kw1) || members.include?(kw2) }
  if group
    group << kw2 unless group.include?(kw2)
    group << kw1 unless group.include?(kw1)
  else
    node_set << [kw1, kw2]
  end
end

# Concatenates, in order, every array of +ary_of_ary+ that contains +x+.
#
# ary_of_ary - Array of Arrays to search.
# x          - element to look for.
#
# Returns a new Array (empty when no inner array contains +x+); elements
# are not de-duplicated and +ary_of_ary+ is left untouched.
def get_set(ary_of_ary, x)
  ary_of_ary.select { |members| members.include?(x) }.reduce([], :+)
end
# Removes, in place, every array of +ary_of_ary+ that contains +x+.
#
# ary_of_ary - Array of Arrays; modified by this call.
# x          - element whose containing arrays are removed.
#
# Uses delete_if rather than calling delete inside each: deleting from
# an array while iterating it with each skips the element that follows
# every deletion, so adjacent matching arrays would survive.
#
# Returns +ary_of_ary+ itself.
def delete_set(ary_of_ary, x)
  ary_of_ary.delete_if { |ary| ary.include?(x) }
end

# Second pass: for every kept line, fuse the groups holding its two
# keywords into a single group unless they already gather the same members.
freq_ary.each do |edge|
  kw1, kw2 = edge.split(/\t/)
  left  = get_set(node_set, kw1)
  right = get_set(node_set, kw2)
  next if left == right

  delete_set(node_set, kw1)
  delete_set(node_set, kw2)
  node_set << (left | right)
end

# Keywords default to white; members of a group take the group's colour,
# cycling through color_tbl when there are more groups than colours.
word_ary.each { |word| node_color[word] = 'white' }

node_set.each_with_index do |members, idx|
  fill = color_tbl[idx % color_tbl.length]
  members.each { |word| node_color[word] = fill }
end

# Emit the graph: header, one node per keyword, one edge per kept line.
print "graph \"#{title}\" {\n"
print " graph [\n    layout = #{mthd}\n  ];\n"

word_ary.each do |word|
  printf "  \"%s\" [ fontname = \"ヒラギノ丸ゴ\"; style = \"filled\"; fillcolor = \"%s\"; fontcolor = \"%s\" ];\n", word, node_color[word], 'black'
end

# freq_ary is consumed while the edges are written.
until freq_ary.empty?
  src, dst, score = freq_ary.shift.split(/\t/)
  printf "  \"%s\" -- \"%s\" [label = \"%4.2f\"];\n", src, dst, score
end

print "}\n"

#!/usr/bin/ruby

# State shared by the whole script.
word_ary   = []  # every keyword read from stdin, in first-appearance order
freq_ary   = []  # raw input lines whose score is above th_val
node_set   = []  # groups (arrays) of keywords joined by above-threshold lines
node_color = {}  # keyword => colour name, filled in after grouping

# Colour names cycled over the keyword groups.
# NOTE(review): 'plum' appears twice in this list, so two different
# groups can end up with the same colour -- confirm whether intended.
color_tbl = %w[
  khaki lemonchiffon aliceblue mistyrose coral
  goldenrod aquamarine lavender palegreen gold lightpink
  plum yellow lightcyan lavenderblush gainsboro
  yellowgreen lightsteelblue palegoldenrod lightskyblue
  greenyellow plum cornflowerblue
]
th_val = 0.75       # only lines scoring strictly above this are kept
mthd   = 'fdp'      # value written as "layout" in the graph header
title  = 'topicmap' # name of the generated graph

# First pass over stdin: each line is "kw1<TAB>kw2<TAB>score".
STDIN.each_line do |line|
  kw1, kw2, freq = line.split(/\t/)
  [kw1, kw2].each { |kw| word_ary << kw unless word_ary.include?(kw) }
  next unless freq.to_f > th_val

  freq_ary << line
  # Attach the pair to the first group that already holds either
  # keyword; only when no such group exists does the pair start a new one.
  group = node_set.find { |members| members.include?(kw1) || members.include?(kw2) }
  if group
    group << kw2 unless group.include?(kw2)
    group << kw1 unless group.include?(kw1)
  else
    node_set << [kw1, kw2]
  end
end

# Concatenates, in order, every array of +ary_of_ary+ that contains +x+.
#
# ary_of_ary - Array of Arrays to search.
# x          - element to look for.
#
# Returns a new Array (empty when no inner array contains +x+); elements
# are not de-duplicated and +ary_of_ary+ is left untouched.
def get_set(ary_of_ary, x)
  ary_of_ary.select { |members| members.include?(x) }.reduce([], :+)
end

# Removes, in place, every array of +ary_of_ary+ that contains +x+.
#
# ary_of_ary - Array of Arrays; modified by this call.
# x          - element whose containing arrays are removed.
#
# Uses delete_if rather than calling delete inside each: deleting from
# an array while iterating it with each skips the element that follows
# every deletion, so adjacent matching arrays would survive.
#
# Returns +ary_of_ary+ itself.
def delete_set(ary_of_ary, x)
  ary_of_ary.delete_if { |ary| ary.include?(x) }
end

# Second pass: for every kept line, fuse the groups holding its two
# keywords into a single group unless they already gather the same members.
freq_ary.each do |edge|
  kw1, kw2 = edge.split(/\t/)
  left  = get_set(node_set, kw1)
  right = get_set(node_set, kw2)
  next if left == right

  delete_set(node_set, kw1)
  delete_set(node_set, kw2)
  node_set << (left | right)
end

# Keywords default to white; members of a group take the group's colour,
# cycling through color_tbl when there are more groups than colours.
word_ary.each { |word| node_color[word] = 'white' }

node_set.each_with_index do |members, idx|
  fill = color_tbl[idx % color_tbl.length]
  members.each { |word| node_color[word] = fill }
end

# Emit the graph: header, one node per keyword, one edge per kept line.
print "graph \"#{title}\" {\n"
print " graph [\n layout = #{mthd}\n ];\n"

word_ary.each do |word|
  printf " \"%s\" [ fontname = \"ヒラギノ丸ゴ\"; style = \"filled\"; fillcolor = \"%s\"; fontcolor = \"%s\" ];\n", word, node_color[word], 'black'
end

# freq_ary is consumed while the edges are written.
until freq_ary.empty?
  src, dst, score = freq_ary.shift.split(/\t/)
  printf " \"%s\" -- \"%s\" [label = \"%4.2f\"];\n", src, dst, score
end

print "}\n"