#!/usr/bin/env perl
#
# cossim.pl - pairwise cosine similarity between documents.
#
# Usage: cossim.pl [--count] [--tfidf] [FILE ...]
#
# Each non-blank input line that does not start with '#' is one document,
# given as whitespace-separated terms.  For every pair of documents (i < j)
# one line "i j similarity" is printed, with 1-based document numbers.
#
#   --count   weight terms by their real frequency (default: 0/1 presence)
#   --tfidf   re-weight the frequencies with TF-IDF before comparing
use strict;
use warnings;
use List::Util qw(sum);
use Getopt::Long;

# Parse the command line, read the documents and print all similarities.
sub main {
    my $opt_count = 0;    # use the real term frequencies
    my $opt_tfidf = 0;    # adjust the frequencies with TF-IDF
    GetOptions(
        "count" => \$opt_count,
        "tfidf" => \$opt_tfidf,
    );

    my $docs    = _read_documents();
    my $vectors = _build_vectors($docs, $opt_count, $opt_tfidf);
    _print_similarities($vectors);
    return;
}

# Read documents from the files named in @ARGV (or STDIN when there are
# none).  Returns a reference to an array of term lists, one per document.
sub _read_documents {
    my @docs;
    while (my $line = <>) {
        chomp $line;
        next if $line =~ /^\s*$/;    # blank line
        next if $line =~ /^\#/;      # comment line

        # split ' ' drops leading whitespace, so an indented line does not
        # yield a spurious empty-string term (split /\s+/ would).
        push @docs, [ split ' ', $line ];
    }
    return \@docs;
}

# Turn term lists into term-weight vectors (one hash reference per document).
#   $docs      - array reference of term lists
#   $use_count - true: weight = occurrences in the document; false: weight = 1
#   $use_tfidf - true: weight = weight / (sum of the document's weights)
#                               * log(number of documents / document frequency)
# Every vector with a non-zero length is scaled to unit length.
sub _build_vectors {
    my ($docs, $use_count, $use_tfidf) = @_;
    my $n_docs = @$docs;

    # Count term occurrences per document and the document frequency of
    # every term.
    my @vectors;
    my %df;
    for my $terms (@$docs) {
        my %tf;
        if ($use_count) {
            $tf{$_}++ for @$terms;
        }
        else {
            $tf{$_} = 1 for @$terms;
        }
        $df{$_}++ for keys %tf;
        push @vectors, \%tf;
    }

    for my $tf (@vectors) {
        if ($use_tfidf) {
            my $total = sum(0, values %$tf);
            $tf->{$_} = $tf->{$_} / $total * log($n_docs / $df{$_})
                for keys %$tf;
        }

        # The length must be taken from the final weights, i.e. after the
        # TF-IDF step.  Measuring the raw frequencies instead leaves the
        # TF-IDF vectors non-unit, and the printed values are then far
        # smaller than the real cosine.
        my $length = sqrt(sum(0, map { $_**2 } values %$tf));

        # All weights are zero when every term occurs in every document
        # (log(1) == 0).  Such a vector has no direction; keep it as zeros so
        # its similarity to anything is 0 rather than dividing by zero.
        next if $length == 0;

        $_ /= $length for values %$tf;    # normalise to unit length
    }
    return \@vectors;
}

# Print "i j similarity" for every pair i < j.  The vectors are already unit
# length, so the dot product is the cosine similarity.
sub _print_similarities {
    my ($vectors) = @_;
    for my $i (0 .. $#$vectors) {
        my $left = $vectors->[$i];
        for my $j ($i + 1 .. $#$vectors) {
            my $right = $vectors->[$j];
            my $sim = sum(0, map { $left->{$_} * ($right->{$_} || 0) } keys %$left);
            printf "%d %d %.8f\n", $i + 1, $j + 1, $sim;
        }
    }
    return;
}

# Run only when executed as a script, so the file can also be loaded with
# require/do (for testing) without reading any input.
main() unless caller;

1;
% cat test-1.txt
hoge huga huga foo foo foo
hoge hoge hoge huga
% ./cossim.pl -c test-1.txt
1 2 0.42257713 (ref. [3])
% cat test-2.txt
リンゴ リンゴ バナナ
リンゴ バナナ ミカン
% ./cossim.pl test-2.txt
1 2 0.81649658 (ref. [4])
% cat test-3.txt
日本 今日 今日 今日 高校 高校 国語
日本 日本 明日 大学 数学
% ./cossim.pl -c test-3.txt
1 2 0.19518001 (ref. [5])
% cat cossim-test.txt
六本木 渋谷 恵比寿 目黒 目黒
六本木 渋谷 渋谷 恵比寿
六本木 六本木 渋谷 渋谷 目黒
% ./cossim.pl cossim-test.txt
1 2 0.86602540
1 3 0.86602540
2 3 0.66666667
% ./cossim.pl -c cossim-test.txt
1 2 0.61721340
1 3 0.75592895
2 3 0.81649658
% ./cossim.pl -t cossim-test.txt
1 2 0.00395490
1 3 0.00395490
2 3 0.00000000
% ./cossim.pl -c -t cossim-test.txt
1 2 0.00126839
1 3 0.00165702
2 3 0.00000000
#!/usr/bin/env perl
#
# twitterapi_id2tweet.pl - fetch tweets by status ID.
#
# Usage: twitterapi_id2tweet.pl [FILE ...]
#
# Reads one status ID per line from the named files (or STDIN).  Lines that
# are not made up of digits only are ignored.  Each status that can be
# fetched is printed to STDOUT as one line of JSON; lookups that fail are
# reported on STDERR and processing continues with the next ID.
use strict;
use warnings;
use JSON;
use Net::Twitter;
use utf8;

binmode STDOUT, ":utf8";
$| = 1;    # unbuffered output: each result appears as soon as it is fetched

# Replace the XXXXXX placeholders with real API credentials.
my $nt = Net::Twitter->new(
    traits              => [qw/API::RESTv1_1/],
    ssl                 => 1,
    consumer_key        => 'XXXXXX',
    consumer_secret     => 'XXXXXX',
    access_token        => 'XXXXXX',
    access_token_secret => 'XXXXXX',
);

while (my $id = <>) {
    # Strip the line ending together with any trailing whitespace, so that
    # IDs from CRLF files (which keep a "\r" after chomp) are not skipped.
    $id =~ s/\s+\z//;
    next if $id !~ /^[0-9]+$/;

    my $status = eval { $nt->show_status($id) };
    my $error  = $@;    # save at once; $@ is a global that later code may reset

    if ($status) {
        print to_json($status) . "\n";
    }
    else {
        # Keep going on failure, but say so instead of dropping the ID
        # without a trace.
        warn "show_status($id) failed: " . ($error || 'no data returned') . "\n";
    }

    # Pause after every request, successful or not -- presumably to stay
    # within the API rate limit (TODO confirm the required interval).
    sleep 6;
}
% cat test.txt
614651698812973056
614290454532657152
611666348645552128
611473704011169792
% ./twitterapi_id2tweet.pl test.txt | fold -60
{"retweeted":false,"source":"<a href=\"http://ifttt.com\" re
l=\"nofollow\">IFTTT</a>","favorited":false,"coordinates":nu
ll,"place":null,"retweet_count":0,"possibly_sensitive_appeal
able":false,"entities":{"media":[{"display_url":"pic.twitter
...