著者:飯尾 淳
Twitterのトレンド分析に関する解説の最終回です。これまで、TwitterのトレンドAPIを叩いてトレンドを集め、Twitter Standard search APIでトレンドに関するツイートを収集し、そのデータに基づいてトレンドの構造を分析する手法を紹介しました。今回は、その日の主要なトピックは何だったかを可視化する方法を解説します。
シェルスクリプトマガジン Vol.69は以下のリンク先でご購入いただけます。![]()
![]()
図7 JSONデータを得るスクリプト(get_data.py)
#!/usr/bin/env python3
"""Download per-label word frequencies and print them as a TSV table.

Usage: ./get_data.py YYYY-MM-DD

The script requests ``<urlbase>/api/<date>`` to get a list of items that
each carry a 'label' and an 'id', then requests
``<urlbase>/api/trends/<id>`` for every item to get its {word, freq}
entries.  The result is written to standard output as a tab-separated
table: a header row ("label" followed by every word seen) and one row per
label with the frequency of each word (0.000 when the label lacks it).
"""
import json
import urllib.request
import sys
import re

# replace urlbase to the base address of your application
urlbase = 'http://iiojun.xyz/twt'

# The single argument is a date formatted as YYYY-MM-DD; fail with a usage
# message instead of an IndexError traceback when it is missing.
if len(sys.argv) < 2:
    sys.exit("usage: {0} YYYY-MM-DD".format(sys.argv[0]))
jsonapi = '/api/' + sys.argv[1]

word_dict = {}  # label -> {word: freq}
words = []      # every word seen, in first-seen order (defines column order)
seen = set()    # same words as 'words', for O(1) membership tests

req = urllib.request.Request(urlbase + jsonapi)
with urllib.request.urlopen(req) as res:
    content = json.loads(res.read().decode('utf8'))

# loop for labels
for item in content:
    label = item['label']
    wid = item['id']
    url2 = "{0}/api/trends/{1}".format(urlbase, wid)
    req2 = urllib.request.Request(url2)
    dct = {}
    with urllib.request.urlopen(req2) as res2:
        content2 = json.loads(res2.read().decode('utf8'))
    # loop for words of each label
    # NOTE(review): only element 0 of the response is read; presumably the
    # API wraps the word list in an outer array -- confirm against the API.
    for item2 in content2[0]:
        word = item2['word']
        dct[word] = item2['freq']
        # keep all words used in the nodes in the list 'words'
        if word not in seen:
            seen.add(word)
            words.append(word)
    # keep the {word: freq} mapping in the dictionary 'word_dict'
    word_dict[label] = dct

# header row
print("label", end="")
for word in words:
    print("\t{0}".format(word), end="")
print()

# one row per label; words the label does not have count as 0.0
for label, dct in word_dict.items():
    print(label, end="")
    for word in words:
        print("\t%6.3f" % dct.get(word, 0.0), end="")
    print()
図9 コサイン類似度を計算するスクリプト(calc_cos_sim.py)
#!/usr/bin/env python3
import numpy as np
import sys
def cos_sim(v1, v2):
    """Return the cosine similarity of two equal-length numeric vectors.

    The value is dot(v1, v2) / (norm(v1) * norm(v2)).  When either vector
    has zero norm the quotient is undefined, so 0.0 is returned instead of
    dividing by zero and producing NaN.
    """
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0:
        return 0.0
    return np.dot(v1, v2) / denom
# Read the tab-separated table from standard input.  The first line is the
# header row and carries no vector data, so it is dropped; slicing (rather
# than pop) keeps empty input from raising IndexError.
lines = sys.stdin.readlines()[1:]

# label -> list of float frequencies, in input order
wdic = {}
for line in lines:
    fields = line.rstrip('\r\n').split('\t')
    label = fields.pop(0)
    if not label and not fields:
        continue  # skip blank lines
    wdic[label] = [float(x) for x in fields]

# Print every unordered pair of distinct labels exactly once, first label
# being the one that appears earlier in the input.  Pairs are enumerated by
# position instead of by concatenated label strings, because concatenation
# is ambiguous ("a" + "bc" == "ab" + "c") and could drop valid pairs.
labels = list(wdic)
for i, key1 in enumerate(labels):
    for key2 in labels[i + 1:]:
        print("{0}\t{1}\t{2:9.6f}"
              .format(key1, key2, cos_sim(wdic[key1], wdic[key2])))
図11 dotスクリプトを作成するスクリプト(mk_net.rb)
#!/usr/bin/ruby
# State filled while reading "kw1<TAB>kw2<TAB>score" lines from standard input.
word_ary = []    # every keyword seen, in first-seen order
freq_ary = []    # raw input lines whose score exceeds th_val
node_set = []    # arrays of keywords linked by above-threshold pairs
node_color = {}  # keyword => fill colour
color_tbl = %w[
  khaki lemonchiffon aliceblue mistyrose coral
  goldenrod aquamarine lavender palegreen gold lightpink
  plum yellow lightcyan lavenderblush gainsboro
  yellowgreen lightsteelblue palegoldenrod lightskyblue
  greenyellow plum cornflowerblue
]
th_val = 0.75
mthd = 'fdp'
title = 'topicmap'

STDIN.each_line do |line|
  kw1, kw2, freq = line.split(/\t/)

  # Register both keywords, whether or not the pair passes the threshold.
  [kw1, kw2].each { |kw| word_ary << kw unless word_ary.include?(kw) }
  next unless freq.to_f > th_val

  freq_ary << line

  # Only the first group that already holds either keyword is extended;
  # a new two-element group is started when no group holds either one.
  group = node_set.find { |members| members.include?(kw1) || members.include?(kw2) }
  if group
    group << kw2 unless group.include?(kw2)
    group << kw1 unless group.include?(kw1)
  else
    node_set << [kw1, kw2]
  end
end
# Gathers the members of every array in +ary_of_ary+ that contains +x+.
#
# The matching arrays are concatenated in their original order into a new
# array (duplicates are kept); the input is not modified. Returns an empty
# array when no array contains +x+.
def get_set(ary_of_ary, x)
  ary_of_ary.select { |ary| ary.include?(x) }.reduce([], :+)
end
# Removes, in place, every array in +ary_of_ary+ that contains +x+.
#
# Uses +delete_if+ rather than calling +delete+ inside +each+: deleting from
# an array while iterating it with +each+ skips the element that follows
# each deleted one, so adjacent matching arrays would be left behind.
#
# Returns +ary_of_ary+ itself.
def delete_set(ary_of_ary, x)
  ary_of_ary.delete_if { |ary| ary.include?(x) }
end
# Merge pass: for every above-threshold pair, fuse the groups holding its two
# keywords into one group unless both lookups already give the same members.
freq_ary.each do |edge|
  kw1, kw2, _freq = edge.split(/\t/)
  members1 = get_set(node_set, kw1)
  members2 = get_set(node_set, kw2)
  next if members1 == members2

  merged = members1 | members2
  delete_set(node_set, kw1)
  delete_set(node_set, kw2)
  node_set << merged
end

# Every keyword starts white; members of a group then share one colour from
# the table, which wraps around when there are more groups than colours.
word_ary.each { |word| node_color[word] = 'white' }
node_set.each_with_index do |members, idx|
  fill = color_tbl[idx % color_tbl.length]
  members.each { |word| node_color[word] = fill }
end

# Emit the graph in dot syntax: header, one node per keyword, one edge per
# above-threshold pair labelled with its score, then the closing brace.
print "graph \"#{title}\" {\n"
print " graph [\n layout = #{mthd}\n ];\n"

node_fmt = " \"%s\" [ fontname = \"ヒラギノ丸ゴ\"; style = \"filled\"; fillcolor = \"%s\"; fontcolor = \"%s\" ];\n"
word_ary.each do |word|
  printf(node_fmt, word, node_color[word], 'black')
end

# freq_ary is consumed (emptied) while the edges are written.
edge_fmt = " \"%s\" -- \"%s\" [label = \"%4.2f\"];\n"
until freq_ary.empty?
  from, to, score = freq_ary.shift.split(/\t/)
  printf(edge_fmt, from, to, score)
end

print "}\n"