
% ./zipf-gen.pl 10000 | head -1000000 > samp.txt % head samp.txt 32 2 105 3 33 586 1 1 924 247
#!/usr/bin/perl
use strict;
use warnings;
my $gamma = 0.005;
my %count;
my $N;
while (<>) {
chomp;
next if /^\s*$/;
$count{$_}++;
$N++;
}
foreach my $k (sort {$count{$b} <=> $count{$a}} keys %count) {
last if $count{$k} < $gamma * $N;
print "$k $count{$k}\n";
}
print "max hash size = ".scalar(keys %count)."\n";
% ./naive-freq.pl samp.txt 1 102069 2 51389 3 33944 4 25432 5 20513 6 16856 7 14365 8 12780 9 11436 10 10353 11 9108 12 8528 13 8039 14 7259 15 6931 16 6377 17 6028 18 5704 19 5332 20 5050 21 5002 max hash size = 10000
#!/usr/bin/perl
use strict;
use warnings;
my $gamma = 0.005;
my $epsilon = 0.004;
my $b_current = 1;
my %count;
my $N;
my $max_hash_size = 0;
while (<>) {
chomp;
next if /^\s*$/;
add($_);
$N++;
if ($N % int(1/$epsilon) == 0) {
next_backet();
}
}
foreach my $k (sort {$count{$b}{f} <=> $count{$a}{f}} keys %count) {
if ($count{$k}{f} >= ($gamma - $epsilon) * $N) {
print "$k $count{$k}{f} $count{$k}{d}\n";
}
}
print "max hash size = $max_hash_size\n";
sub add {
my ($k) = @_;
if (not $count{$k}) {
$count{$k}{f} = 1;
$count{$k}{d} = $b_current - 1;
} else {
$count{$k}{f}++;
}
}
sub next_backet {
my $hash_size = scalar(keys %count);
$max_hash_size = $hash_size if $max_hash_size < $hash_size;
foreach my $k (keys %count) {
if ($count{$k}{f} <= $b_current - $count{$k}{d}) {
delete $count{$k};
}
}
$b_current++;
}
% ./lcm-freq.pl samp.txt 1 102069 0 2 51389 0 3 33944 0 4 25432 0 5 20513 0 6 16856 0 7 14365 0 8 12780 1 9 11436 0 10 10353 0 11 9108 0 12 8527 1 13 8039 0 14 7259 0 15 6929 4 16 6377 1 17 6027 1 18 5702 2 19 5327 5 20 5047 6 21 5002 0 22 4601 16 23 4379 11 24 3945 344 25 3632 421 max hash size = 203
| 単純カウント | LCMカウント | |
|---|---|---|
| 最大ハッシュサイズ | 10000 | 203 |
| ラベル | 頻度(単純カウント) | 頻度(LCMカウント) |
|---|---|---|
| 1 | 102069 | 102069 |
| 2 | 51389 | 51389 |
| 3 | 33944 | 33944 |
| 4 | 25432 | 25432 |
| 5 | 20513 | 20513 |
| 6 | 16856 | 16856 |
| 7 | 14365 | 14365 |
| 8 | 12780 | 12780 |
| 9 | 11436 | 11436 |
| 10 | 10353 | 10353 |
| 11 | 9108 | 9108 |
| 12 | 8528 | 8527 |
| 13 | 8039 | 8039 |
| 14 | 7259 | 7259 |
| 15 | 6931 | 6929 |
| 16 | 6377 | 6377 |
| 17 | 6028 | 6027 |
| 18 | 5704 | 5702 |
| 19 | 5332 | 5327 |
| 20 | 5050 | 5047 |
| 21 | 5002 | 5002 |
