% ./zipf-gen.pl 10000 | head -1000000 > samp.txt % head samp.txt 32 2 105 3 33 586 1 1 924 247
#!/usr/bin/perl use strict; use warnings; my $gamma = 0.005; my %count; my $N; while (<>) { chomp; next if /^\s*$/; $count{$_}++; $N++; } foreach my $k (sort {$count{$b} <=> $count{$a}} keys %count) { last if $count{$k} < $gamma * $N; print "$k $count{$k}\n"; } print "max hash size = ".scalar(keys %count)."\n";
% ./naive-freq.pl samp.txt 1 102069 2 51389 3 33944 4 25432 5 20513 6 16856 7 14365 8 12780 9 11436 10 10353 11 9108 12 8528 13 8039 14 7259 15 6931 16 6377 17 6028 18 5704 19 5332 20 5050 21 5002 max hash size = 10000
#!/usr/bin/perl use strict; use warnings; my $gamma = 0.005; my $epsilon = 0.004; my $b_current = 1; my %count; my $N; my $max_hash_size = 0; while (<>) { chomp; next if /^\s*$/; add($_); $N++; if ($N % int(1/$epsilon) == 0) { next_backet(); } } foreach my $k (sort {$count{$b}{f} <=> $count{$a}{f}} keys %count) { if ($count{$k}{f} >= ($gamma - $epsilon) * $N) { print "$k $count{$k}{f} $count{$k}{d}\n"; } } print "max hash size = $max_hash_size\n"; sub add { my ($k) = @_; if (not $count{$k}) { $count{$k}{f} = 1; $count{$k}{d} = $b_current - 1; } else { $count{$k}{f}++; } } sub next_backet { my $hash_size = scalar(keys %count); $max_hash_size = $hash_size if $max_hash_size < $hash_size; foreach my $k (keys %count) { if ($count{$k}{f} <= $b_current - $count{$k}{d}) { delete $count{$k}; } } $b_current++; }
% ./lcm-freq.pl samp.txt 1 102069 0 2 51389 0 3 33944 0 4 25432 0 5 20513 0 6 16856 0 7 14365 0 8 12780 1 9 11436 0 10 10353 0 11 9108 0 12 8527 1 13 8039 0 14 7259 0 15 6929 4 16 6377 1 17 6027 1 18 5702 2 19 5327 5 20 5047 6 21 5002 0 22 4601 16 23 4379 11 24 3945 344 25 3632 421 max hash size = 203
単純カウント | LCMカウント | |
---|---|---|
最大ハッシュサイズ | 10000 | 203 |
ラベル | 頻度(単純カウント) | 頻度(LCMカウント) |
---|---|---|
1 | 102069 | 102069 |
2 | 51389 | 51389 |
3 | 33944 | 33944 |
4 | 25432 | 25432 |
5 | 20513 | 20513 |
6 | 16856 | 16856 |
7 | 14365 | 14365 |
8 | 12780 | 12780 |
9 | 11436 | 11436 |
10 | 10353 | 10353 |
11 | 9108 | 9108 |
12 | 8528 | 8527 |
13 | 8039 | 8039 |
14 | 7259 | 7259 |
15 | 6931 | 6929 |
16 | 6377 | 6377 |
17 | 6028 | 6027 |
18 | 5704 | 5702 |
19 | 5332 | 5327 |
20 | 5050 | 5047 |
21 | 5002 | 5002 |