著者:飯尾 淳
Twitterのトレンド分析に関する解説の最終回です。これまで、TwitterのトレンドAPIを叩いてトレンドを集め、Twitter Standard search APIでトレンドに関するツイートを収集し、そのデータに基づいてトレンドの構造を分析する手法を紹介しました。今回は、その日の主要なトピックは何だったかを可視化する方法を解説します。
シェルスクリプトマガジン Vol.69は以下のリンク先でご購入いただけます。
図7 JSONデータを得るスクリプト(get_data.py)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
#!/usr/bin/env python3
"""Fetch per-label word frequencies as JSON and print them as a TSV table.

The first output line is a header ("label" followed by every word seen);
each following line is one label followed by the frequency of each word
(0.000 when the label does not use that word).
"""
import json
import re
import sys
import urllib.request

# replace urlbase to the base address of your application
# this script should be called as $ ./get_data.py 2019-02-01
# which means the argument should be a date formatted in YYYY-MM-DD
urlbase = 'http://iiojun.xyz/twt'

# ASCII digits only: the value is spliced into a URL path.
DATE_PATTERN = re.compile(r'[0-9]{4}-[0-9]{2}-[0-9]{2}')


def is_valid_date(text):
    """Return True when *text* has the form YYYY-MM-DD (digits and dashes only)."""
    return DATE_PATTERN.fullmatch(text) is not None


def fetch_json(url):
    """GET *url* and return the UTF-8 response body decoded as JSON."""
    req = urllib.request.Request(url)
    with urllib.request.urlopen(req) as res:
        return json.loads(res.read().decode('utf8'))


def collect_word_freqs(date):
    """Download the labels for *date* and the word frequencies of each label.

    Returns a pair ``(word_dict, words)``: ``word_dict`` maps each label to a
    ``{word: freq}`` dict, and ``words`` lists every word in first-seen order.
    """
    word_dict = {}
    words = []
    seen = set()  # O(1) membership test; 'words' keeps the column order

    # loop for labels
    for item in fetch_json(urlbase + '/api/' + date):
        label = item['label']
        wid = item['id']
        dct = {}

        # loop for words of each label
        # (the word entries are read from the first element of the response)
        content2 = fetch_json("{0}/api/trends/{1}".format(urlbase, wid))
        for item2 in content2[0]:
            word = item2['word']
            dct[word] = item2['freq']
            # keep all the words used in the nodes in the list 'words'
            if word not in seen:
                seen.add(word)
                words.append(word)

        # keep the {word: freq} mapping in the dictionary 'word_dict'
        word_dict[label] = dct

    return word_dict, words


def format_table(word_dict, words):
    """Return the TSV lines (without newlines) for the header and every label."""
    rows = ["label" + "".join("\t{0}".format(word) for word in words)]
    for key, dct in word_dict.items():
        cells = "".join("\t%6.3f" % dct.get(word, 0.0) for word in words)
        rows.append(str(key) + cells)
    return rows


def main(argv=None):
    """Validate the date argument, fetch the data and print the table."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2 or not is_valid_date(argv[1]):
        prog = argv[0] if argv else 'get_data.py'
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit("usage: {0} YYYY-MM-DD".format(prog))

    word_dict, words = collect_word_freqs(argv[1])
    for row in format_table(word_dict, words):
        print(row)


if __name__ == "__main__":
    main()
図9 コサイン類似度を計算するスクリプト(calc_cos_sim.py)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
#!/usr/bin/env python3
"""Print the cosine similarity of every pair of labels in a TSV table.

Reads from standard input a table whose first line is a header and whose
remaining lines are "label<TAB>value<TAB>value...".  Writes one line per
unordered pair of labels: "label1<TAB>label2<TAB>similarity".
"""
import itertools
import sys

import numpy as np


def cos_sim(v1, v2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Returns 0.0 when either vector has zero norm, where the similarity is
    undefined, instead of dividing by zero and producing nan.
    """
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0:
        return 0.0
    return np.dot(v1, v2) / denom


def parse_table(lines):
    """Return ``{label: [float, ...]}`` parsed from *lines*.

    The first line (the header) is ignored, as are blank lines and lines
    that carry a label but no values.  A repeated label keeps its last row.
    """
    wdic = {}
    for line in lines[1:]:
        fields = line.rstrip('\r\n').split('\t')
        if len(fields) < 2:
            continue
        wdic[fields[0]] = [float(x) for x in fields[1:]]
    return wdic


def similarity_pairs(wdic):
    """Yield ``(label1, label2, similarity)`` once per unordered label pair.

    Pairs come out in the order the labels were inserted into *wdic*.
    combinations() works on the keys themselves, so two different pairs can
    never be confused the way concatenated label strings can be
    ("ab" + "c" equals "a" + "bc").
    """
    for key1, key2 in itertools.combinations(wdic, 2):
        yield key1, key2, cos_sim(wdic[key1], wdic[key2])


def main():
    """Read the table from stdin and print the similarity of each pair."""
    wdic = parse_table(sys.stdin.readlines())
    for key1, key2, sim in similarity_pairs(wdic):
        print("{0}\t{1}\t{2:9.6f}".format(key1, key2, sim))


if __name__ == "__main__":
    main()
図11 dotスクリプトを作成するスクリプト(mk_net.rb)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
#!/usr/bin/ruby
# Reads tab-separated "keyword1<TAB>keyword2<TAB>score" lines from STDIN and
# prints a Graphviz dot script to STDOUT.  Every keyword becomes a node;
# pairs whose score exceeds th_val become labelled edges, and keywords that
# are connected through such edges share one fill colour.

# Returns the concatenation of every array in ary_of_ary that includes x
# (an empty array when none does).
def get_set(ary_of_ary, x)
  ret_ary = []
  ary_of_ary.each {|ary|
    ret_ary = ret_ary + ary if ary.include?(x)
  }
  return ret_ary
end

# Removes, in place, every array in ary_of_ary that includes x and returns
# ary_of_ary.
def delete_set(ary_of_ary, x)
  # delete_if removes safely while scanning; calling Array#delete inside
  # Array#each skips the element that follows each deleted one.
  ary_of_ary.delete_if {|ary| ary.include?(x) }
end

if __FILE__ == $PROGRAM_NAME
  word_ary = []     # every keyword, in first-seen order
  freq_ary = []     # input lines whose score exceeds th_val (the edges)
  node_set = []     # groups of keywords linked by those edges
  node_color = {}   # keyword => fill colour

  color_tbl = %w(khaki lemonchiffon aliceblue mistyrose coral
                 goldenrod aquamarine lavender palegreen gold
                 lightpink plum yellow lightcyan lavenderblush
                 gainsboro yellowgreen lightsteelblue palegoldenrod
                 lightskyblue greenyellow plum cornflowerblue)

  th_val = 0.75
  mthd = 'fdp'
  title = 'topicmap'

  STDIN.each {|line|
    (kw1,kw2,freq) = line.split(/\t/)
    # Skip blank or malformed lines so they do not become nil/garbage nodes.
    next if freq.nil?
    word_ary.push(kw1) unless word_ary.include?(kw1)
    word_ary.push(kw2) unless word_ary.include?(kw2)
    if freq.to_f > th_val
      freq_ary.push(line)
      flag = false
      node_set.each {|x|
        # Both keywords already share a group: nothing to add.
        if x.include?(kw1) && x.include?(kw2)
          flag = true
          break
        end
        # Exactly one keyword is in this group: pull the other one in.
        len0 = x.length
        x.push(kw2) if x.include?(kw1) && !x.include?(kw2)
        x.push(kw1) if !x.include?(kw1) && x.include?(kw2)
        if len0 < x.length
          flag = true
          break
        end
      }
      # Neither keyword belongs to an existing group: start a new one.
      node_set.push([kw1, kw2]) unless flag
    end
  }

  # Merge groups that an edge connects but that were built separately.
  freq_ary.each {|x|
    (kw1,kw2,_freq) = x.split(/\t/)
    x1 = get_set(node_set, kw1)
    x2 = get_set(node_set, kw2)
    next if (x1 == x2)
    x3 = x1 | x2
    delete_set(node_set, kw1)
    delete_set(node_set, kw2)
    node_set.push(x3)
  }

  # Keywords without any edge above the threshold stay white.
  word_ary.each {|x| node_color[x] = 'white' }
  node_set.each_with_index {|value,index|
    # Colours are reused cyclically when there are more groups than colours.
    i = index % color_tbl.length
    value.each {|x| node_color[x] = color_tbl[i] }
  }

  print "graph \"#{title}\" {\n"
  print " graph [\n layout = #{mthd}\n ];\n"

  word_ary.each {|x|
    printf " \"%s\" [ fontname = \"ヒラギノ丸ゴ\"; style = \"filled\"; fillcolor = \"%s\"; fontcolor = \"%s\" ];\n", x, node_color[x], 'black'
  }

  freq_ary.each {|edge|
    (f,t,prob) = edge.split(/\t/)
    # prob still carries the trailing newline; convert it explicitly.
    printf " \"%s\" -- \"%s\" [label = \"%4.2f\"];\n", f, t, prob.to_f
  }

  print "}\n"
end