複数のクエリ "A120","A280","B020" が一度に与えられて、これらが行頭にマッチする行を全て取得する、という感じ。A102[tab]2022/01/01[tab]2022/02/11[tab]2387 A120[tab]2022/02/20[tab]2023/12/31[tab]100 A280[tab]2022/03/01[tab]2022/03/02[tab]89 B007[tab]2022/04/05[tab]2022/08/29[tab]980 B010[tab]2022/05/01[tab]2022/05/10[tab]12 C763[tab]2023/01/01[tab]2023/06/30[tab]7800 ...
wc current.tsv 1207384 12954296 857036094 current.tsv
B00AE090AL[tab]XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX[tab]XXXX... B00J7A10QC[tab]XXXXXXXXXXXXXXX[tab]XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX... B00JJFB0EC[tab]XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX[tab]XXXXXXXXX...
cut -f1 current.tsv | shuffle.pl | head -10 | sort > sample-asins-10.txt cut -f1 current.tsv | shuffle.pl | head -100 | sort > sample-asins-100.txt cut -f1 current.tsv | shuffle.pl | head -1000 | sort > sample-asins-1k.txt cut -f1 current.tsv | shuffle.pl | head -10000 | sort > sample-asins-10k.txt cut -f1 current.tsv | shuffle.pl | head -100000 | sort > sample-asins-100k.txt
SUFARYについてはここでは説明しない。 mkary -l current.tsv
#!/usr/bin/env zsh
# look.sh -- batch prefix lookup using look(1).
#
# Usage: look.sh SORTED_FILE < queries
#
# Reads one query per line from stdin, takes the first tab-separated field
# as the key, and prints whatever look(1) reports for that key in
# SORTED_FILE. Queries with an empty key or without any hit print nothing.

if [[ -z $1 ]]; then
    # Without a file argument look(1) would not search the intended file.
    print -u2 -r -- "usage: $0 SORTED_FILE < queries"
    exit 1
fi

# -r keeps backslashes in the query text literal.
while read -r line
do
    # print -r avoids echo's escape-sequence processing of the query text.
    key=$(print -r -- "$line" | cut -f1)
    if [[ -n $key ]]; then
        # "--" ends option parsing so a key starting with "-" stays a key.
        up=$(look -- "$key" "$1")
        #up=$(sass "$key" "$1") # look-sass.sh variant
        if [[ -n $up ]]; then
            print -r -- "$up"
        fi
    fi
done
#!/usr/bin/perl
# look.pl -- batch prefix lookup in a sorted file using Search::Dict.
#
# Usage: look.pl SORTED_FILE < queries
#
# For each non-blank query line read via <>, binary-searches SORTED_FILE and
# prints every line that begins with the query string.
use strict;
use warnings;
use Search::Dict;

my $fn = shift;
die "usage: $0 SORTED_FILE < queries\n" unless defined $fn;
open(my $fh, "<", $fn) or die "can't open [$fn]: $!";

QUERY:
while (my $key = <>) {
    chomp $key;
    next QUERY if $key =~ /^\s*$/;

    # look() moves $fh to the first line that is stringwise >= $key and
    # returns -1 when the search itself fails.
    if (look($fh, $key) < 0) {
        warn "look failed for [$key] in [$fn]\n";
        next QUERY;
    }

    # Assuming the file is sorted stringwise, all lines sharing the prefix
    # are adjacent, so keep reading until the first non-matching line.
    # readline() yields undef at EOF (the key sorts after every line); the
    # defined() test stops there instead of matching against undef.
    while (defined(my $line = readline($fh))) {
        last if index($line, $key) != 0;
        print $line;
    }
}
close($fh);
| 評価対象ツール \ クエリ数 | 10 | 100 | 1000 | 1万 | 10万 |
|---|---|---|---|---|---|
| look.sh (look) | 0.04 | 0.35 | 3.6 | 35.1 | |
| look-sass.sh (SUFARY) | 0.26 | 2.86 | 28.6 | 267.2 | |
| look.pl (Search::Dict) | 0.01 | 0.03 | 0.1 | 0.9 | 8.7 |
| looks.py (pure Python) | 0.04 | 0.08 | 0.6 | 5.6 | 54.3 |
| join | 11.1 | 11.2 | 11.4 | 11.6 | 11.6 |
| perl-regexp | 1.3 | 1.3 | 1.3 | 1.4 | |
[look:10]
time (cat sample-asins-10.txt | ./look.sh current.tsv > a1)
0.01s user 0.04s system 132% cpu 0.038 total
[SUFARY(sass):10]
time (cat sample-asins-10.txt | ./look-sass.sh current.tsv > a2)
0.01s user 0.26s system 104% cpu 0.259 total
[look(Search::Dict):10]
time (cat sample-asins-10.txt | ./look.pl current.tsv > a3)
0.01s user 0.01s system 123% cpu 0.010 total
[join:10]
time (cat sample-asins-10.txt | join -t$'\t' - current.tsv > a4)
10.92s user 0.15s system 99% cpu 11.067 total
[perl-regexp:10]
PAT=`cat sample-asins-10.txt | xargs | sed 's/ /|/g'`
time (perl -nle 'print if /^('$PAT')/' current.tsv > a5)
1.09s user 0.21s system 99% cpu 1.307 total
[look:100]
time (cat sample-asins-100.txt | ./look.sh current.tsv > a1)
0.04s user 0.37s system 117% cpu 0.350 total
[SUFARY(sass):100]
time (cat sample-asins-100.txt | ./look-sass.sh current.tsv > a2)
0.12s user 2.82s system 103% cpu 2.857 total
[look(Search::Dict):100]
time (cat sample-asins-100.txt | ./look.pl current.tsv > a3)
0.02s user 0.02s system 143% cpu 0.026 total
[join:100]
time (join -t$'\t' sample-asins-100.txt current.tsv > a4)
10.98s user 0.19s system 99% cpu 11.173 total
[perl-regexp:100]
PAT=`cat sample-asins-100.txt | xargs | sed 's/ /|/g'`
time (perl -nle 'print if /^('$PAT')/' current.tsv > a5)
1.10s user 0.22s system 99% cpu 1.318 total
[look:1000]
time (cat sample-asins-1k.txt | ./look.sh current.tsv > a1)
0.95s user 3.72s system 131% cpu 3.553 total
[SUFARY(sass):1000]
time (cat sample-asins-1k.txt | ./look-sass.sh current.tsv > a2)
0.98s user 28.41s system 102% cpu 28.585 total
[look(Search::Dict):1000]
time (cat sample-asins-1k.txt | ./look.pl current.tsv > a3)
0.09s user 0.03s system 105% cpu 0.109 total
[join:1000]
time (join -t$'\t' sample-asins-1k.txt current.tsv > a4)
11.25s user 0.18s system 99% cpu 11.440 total
[perl-regexp:1000]
PAT=`cat sample-asins-1k.txt | xargs | sed 's/ /|/g'`
time (perl -nle 'print if /^('$PAT')/' current.tsv > a5)
1.09s user 0.20s system 99% cpu 1.292 total
[look:10000]
time (cat sample-asins-10k.txt | ./look.sh current.tsv > a1)
8.73s user 36.74s system 129% cpu 35.129 total
[SUFARY(sass):10000]
time (cat sample-asins-10k.txt | ./look-sass.sh current.tsv > a2)
9.68s user 265.59s system 103% cpu 4:27.22 total
[look(Search::Dict):10000]
time (cat sample-asins-10k.txt | ./look.pl current.tsv > a3)
0.58s user 0.32s system 100% cpu 0.906 total
[join:10000]
time (join -t$'\t' sample-asins-10k.txt current.tsv > a4)
11.45s user 0.17s system 99% cpu 11.623 total
[perl-regexp:10000]
PAT=`cat sample-asins-10k.txt | xargs | sed 's/ /|/g'`
time (perl -nle 'print if /^('$PAT')/' current.tsv > a5)
1.19s user 0.19s system 99% cpu 1.377 total
[look(Search::Dict):100000] time (cat sample-asins-100k.txt | ./look.pl current.tsv > a7) 6.49s user 2.24s system 99% cpu 8.734 total [join:100000] time (join -t$'\t' sample-asins-100k.txt current.tsv > a8) 11.32s user 0.23s system 99% cpu 11.562 total
( cat sample-asins-10.txt | ./looks.py current.tsv > a6; ) 0.02s user 0.02s system 107% cpu 0.036 total ( cat sample-asins-100.txt | ./looks.py current.tsv > a6; ) 0.05s user 0.03s system 105% cpu 0.082 total ( cat sample-asins-1k.txt | ./looks.py current.tsv > a6; ) 0.44s user 0.16s system 100% cpu 0.600 total ( cat sample-asins-10k.txt | ./looks.py current.tsv > a6; ) 4.25s user 1.33s system 99% cpu 5.582 total ( cat sample-asins-100k.txt | ./looks.py current.tsv > a6; ) 41.67s user 12.60s system 99% cpu 54.311 total