#!/usr/bin/env perl
use strict;
use warnings;
use List::Util qw(sum);
use Getopt::Long;
# Entry-point guard: run the tool when executed as a script, but do nothing
# when the file is loaded with require/do, so the helpers below can be called
# on their own.
main() unless caller;

# Parse the options, read one whitespace-tokenised document per input line
# (files named on the command line, or STDIN), and print the cosine similarity
# of every pair of documents as "<i> <j> <similarity>".  Indices are 1-based
# and count only the lines that were kept (blank and '#' lines are skipped).
sub main {
    my $opt_count = 0;    # --count: use actual term counts instead of 0/1 presence
    my $opt_tfidf = 0;    # --tfidf: re-weight the term frequencies with TF-IDF
    GetOptions(
        "count" => \$opt_count,
        "tfidf" => \$opt_tfidf,
    ) or die "usage: " . $0 . " [--count] [--tfidf] [file ...]\n";

    ### input
    my @docs;
    while (my $line = <>) {
        chomp $line;
        next if $line =~ /^\s*$/;    # skip blank lines
        next if $line =~ /^\#/;      # skip comment lines
        # split ' ' ignores leading whitespace, so an indented line does not
        # contribute an empty-string term.
        push @docs, [split ' ', $line];
    }

    ### term-frequency counting and weighting
    my $vectors = _build_unit_vectors(\@docs, $opt_count, $opt_tfidf);

    ### output
    for my $i (0 .. $#{$vectors}) {
        for my $j ($i + 1 .. $#{$vectors}) {
            printf "%d %d %.8f\n", $i + 1, $j + 1,
                _dot_product($vectors->[$i], $vectors->[$j]);
        }
    }
    return;
}

# Turn a list of documents into length-normalised term vectors.
#   $docs      - array ref of array refs of terms
#   $use_count - true: weight = number of occurrences; false: weight = 1
#   $use_tfidf - true: replace each weight by (weight / sum of weights in the
#                document) * log(N / number of documents containing the term)
# Returns an array ref of hash refs (term => weight), one per document, each
# scaled to Euclidean length 1.  A vector whose weights are all zero (or a
# document without terms) is left as is instead of being divided by zero.
sub _build_unit_vectors {
    my ($docs, $use_count, $use_tfidf) = @_;
    my $n_docs = @{$docs};

    my @vectors;
    my %df;    # term => number of documents that contain it
    for my $doc (@{$docs}) {
        my %tf;
        if ($use_count) {
            $tf{$_}++ for @{$doc};
        }
        else {
            $tf{$_} = 1 for @{$doc};
        }
        $df{$_}++ for keys %tf;
        push @vectors, \%tf;
    }

    for my $vec (@vectors) {
        if ($use_tfidf) {
            my $total = sum(0, values %{$vec});
            for my $term (keys %{$vec}) {
                $vec->{$term} =
                    $vec->{$term} / $total * log($n_docs / $df{$term});
            }
        }
        # The length must be taken from the final weights, i.e. after the
        # TF-IDF step; using the pre-weighting values would leave the vector
        # un-normalised and the result would not be a cosine.
        my $len = sqrt(sum(0, map { $_**2 } values %{$vec}));
        next if $len == 0;    # all-zero vector: nothing to scale
        $_ /= $len for values %{$vec};
    }
    return \@vectors;
}

# Dot product of two sparse vectors given as hash refs (term => weight).
# For vectors of length 1 this is their cosine similarity.  Returns 0 when
# the vectors share no term.
sub _dot_product {
    my ($u, $v) = @_;
    return sum(0, map { $u->{$_} * $v->{$_} } grep { exists $v->{$_} } keys %{$u});
}

1;    # true value so the file can also be loaded with require
% cat test-1.txt
hoge huga huga foo foo foo
hoge hoge hoge huga
% ./cossim.pl -c test-1.txt
1 2 0.42257713 (ref. [3])
% cat test-2.txt
リンゴ リンゴ バナナ
リンゴ バナナ ミカン
% ./cossim.pl test-2.txt
1 2 0.81649658 (ref. [4])
% cat test-3.txt
日本 今日 今日 今日 高校 高校 国語
日本 日本 明日 大学 数学
% ./cossim.pl -c test-3.txt
1 2 0.19518001 (ref. [5])
% cat cossim-test.txt
六本木 渋谷 恵比寿 目黒 目黒
六本木 渋谷 渋谷 恵比寿
六本木 六本木 渋谷 渋谷 目黒
% ./cossim.pl cossim-test.txt
1 2 0.86602540
1 3 0.86602540
2 3 0.66666667
% ./cossim.pl -c cossim-test.txt
1 2 0.61721340
1 3 0.75592895
2 3 0.81649658
% ./cossim.pl -t cossim-test.txt
1 2 0.00395490
1 3 0.00395490
2 3 0.00000000
% ./cossim.pl -c -t cossim-test.txt
1 2 0.00126839
1 3 0.00165702
2 3 0.00000000
#!/usr/bin/env perl
use strict;
use warnings;
use JSON;
use Net::Twitter;
use utf8;
# Read tweet IDs (one per line) from the files named on the command line or
# from STDIN, look each one up with Net::Twitter's show_status, and print the
# returned structure as one JSON document per line on STDOUT.

binmode STDOUT, ":utf8";    # to_json returns characters; encode them on output
$| = 1;                     # unbuffered, so each result appears as soon as it is fetched

# The 'XXXXXX' values are placeholders; real credentials must be filled in.
my $nt = Net::Twitter->new(
    traits => [qw/API::RESTv1_1/],
    ssl => 1,
    consumer_key => 'XXXXXX',
    consumer_secret => 'XXXXXX',
    access_token => 'XXXXXX',
    access_token_secret => 'XXXXXX',
);

while (my $id = <>) {
    chomp $id;
    next if $id !~ /^\d+$/;    # ignore anything that is not a bare numeric ID

    my $status = eval { $nt->show_status($id) };
    if ($status) {
        print to_json($status)."\n";
    }
    else {
        # Report the failure instead of dropping it silently; otherwise an
        # authentication or rate-limit error is indistinguishable from
        # "no input".  STDOUT still carries JSON lines only.
        warn "failed to fetch status $id: " . ($@ || 'empty response') . "\n";
    }

    # Pause between requests -- presumably to stay under the API rate limit.
    sleep 6;
}
% cat test.txt
614651698812973056
614290454532657152
611666348645552128
611473704011169792
% ./twitterapi_id2tweet.pl test.txt | fold -60
{"retweeted":false,"source":"<a href=\"http://ifttt.com\" re
l=\"nofollow\">IFTTT</a>","favorited":false,"coordinates":nu
ll,"place":null,"retweet_count":0,"possibly_sensitive_appeal
able":false,"entities":{"media":[{"display_url":"pic.twitter
...