69 件 見つかりました。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 [ 次へ ]
複数のクエリ "A120","A280","B020" が一度に与えられて、これらが行頭にマッチする行を全て取得する、という感じ。A102[tab]2022/01/01[tab]2022/02/11[tab]2387 A120[tab]2022/02/20[tab]2023/12/31[tab]100 A280[tab]2022/03/01[tab]2022/03/02[tab]89 B007[tab]2022/04/05[tab]2022/08/29[tab]980 B010[tab]2022/05/01[tab]2022/05/10[tab]12 C763[tab]2023/01/01[tab]2023/06/30[tab]7800 ...
wc current.tsv 1207384 12954296 857036094 current.tsv
B00AE090AL[tab]XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX[tab]XXXX... B00J7A10QC[tab]XXXXXXXXXXXXXXX[tab]XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX... B00JJFB0EC[tab]XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX[tab]XXXXXXXXX...
cut -f1 current.tsv | shuffle.pl | head -10 | sort > sample-asins-10.txt cut -f1 current.tsv | shuffle.pl | head -100 | sort > sample-asins-100.txt cut -f1 current.tsv | shuffle.pl | head -1000 | sort > sample-asins-1k.txt cut -f1 current.tsv | shuffle.pl | head -10000 | sort > sample-asins-10k.txt cut -f1 current.tsv | shuffle.pl | head -100000 | sort > sample-asins-100k.txt
SUFARYについてはここでは説明しない。 mkary -l current.tsv
#!/usr/bin/env zsh
# look.sh: batch prefix lookup.
# Usage: look.sh FILE < queries
# For each query line on stdin, the first tab-separated field is used as the
# key and handed to look(1) together with FILE ($1); whatever look prints is
# echoed to stdout.
while read line; do
  key=$(echo $line | cut -f1)

  # Nothing to look up for an empty key.
  [[ -z $key ]] && continue

  hits=$(look $key $1)
  #hits=$(sass $key $1) # look-sass.sh

  # Stay silent when the lookup produced no output.
  if [[ -n $hits ]]; then
    echo $hits
  fi
done
#!/usr/bin/perl
# look.pl: batch prefix lookup against a sorted file via binary search.
#
# Usage: look.pl SORTED_FILE < queries
#
# Every non-blank query line is used as a key; all lines of SORTED_FILE that
# start with that key are printed to stdout.
use strict;
use warnings;
use Search::Dict;

my $fn = shift;
die "usage: $0 SORTED_FILE < queries\n" unless defined $fn;
open(my $fh, "<", $fn) or die "can't open [$fn]: $!";

while (my $query = <>) {
    chomp $query;
    next if $query =~ /^\s*$/;

    # Search::Dict::look positions $fh at the first line that is
    # stringwise >= $query.
    look $fh, $query;

    # In a sorted file the lines sharing a prefix are adjacent, so keep
    # reading until the prefix stops matching.  readline returns undef at
    # EOF (the key sorts after every line); testing definedness avoids the
    # "uninitialized value" warning the single unguarded read produced.
    while (defined(my $line = readline($fh))) {
        last if index($line, $query) != 0;
        print $line;
    }
}
close($fh);
評価対象ツール \ クエリ数 | 10 | 100 | 1000 | 1万 | 10万 |
---|---|---|---|---|---|
look.sh (look) | 0.04 | 0.35 | 3.6 | 35.1 | |
look-sass.sh (SUFARY) | 0.23 | 2.86 | 28.6 | 267.2 | |
look.pl (Search::Dict) | 0.01 | 0.03 | 0.1 | 0.9 | 8.7 |
looks.py (pure Python) | 0.04 | 0.08 | 0.6 | 5.6 | 54.3 |
join | 11.1 | 11.2 | 11.4 | 11.6 | 11.6 |
perl-regexp | 1.3 | 1.3 | 1.3 | 1.4 | |
[look:10]
time (cat sample-asins-10.txt | ./look.sh current.tsv > a1)
0.01s user 0.04s system 132% cpu 0.038 total
[SUFARY(sass):10]
time (cat sample-asins-10.txt | ./look-sass.sh current.tsv > a2)
0.01s user 0.26s system 104% cpu 0.259 total
[look(Search::Dict):10]
time (cat sample-asins-10.txt | ./look.pl current.tsv > a3)
0.01s user 0.01s system 123% cpu 0.010 total
[join:10]
time (cat sample-asins-10.txt | join -t$'\t' - current.tsv > a4)
10.92s user 0.15s system 99% cpu 11.067 total
[perl-regexp:10]
PAT=`cat sample-asins-10.txt | xargs | sed 's/ /|/g'`
time (perl -nle 'print if /^('$PAT')/' current.tsv > a5)
1.09s user 0.21s system 99% cpu 1.307 total
[look:100]
time (cat sample-asins-100.txt | ./look.sh current.tsv > a1)
0.04s user 0.37s system 117% cpu 0.350 total
[SUFARY(sass):100]
time (cat sample-asins-100.txt | ./look-sass.sh current.tsv > a2)
0.12s user 2.82s system 103% cpu 2.857 total
[look(Search::Dict):100]
time (cat sample-asins-100.txt | ./look.pl current.tsv > a3)
0.02s user 0.02s system 143% cpu 0.026 total
[join:100]
time (join -t$'\t' sample-asins-100.txt current.tsv > a4)
10.98s user 0.19s system 99% cpu 11.173 total
[perl-regexp:100]
PAT=`cat sample-asins-100.txt | xargs | sed 's/ /|/g'`
time (perl -nle 'print if /^('$PAT')/' current.tsv > a5)
1.10s user 0.22s system 99% cpu 1.318 total
[look:1000]
time (cat sample-asins-1k.txt | ./look.sh current.tsv > a1)
0.95s user 3.72s system 131% cpu 3.553 total
[SUFARY(sass):1000]
time (cat sample-asins-1k.txt | ./look-sass.sh current.tsv > a2)
0.98s user 28.41s system 102% cpu 28.585 total
[look(Search::Dict):1000]
time (cat sample-asins-1k.txt | ./look.pl current.tsv > a3)
0.09s user 0.03s system 105% cpu 0.109 total
[join:1000]
time (join -t$'\t' sample-asins-1k.txt current.tsv > a4)
11.25s user 0.18s system 99% cpu 11.440 total
[perl-regexp:1000]
PAT=`cat sample-asins-1k.txt | xargs | sed 's/ /|/g'`
time (perl -nle 'print if /^('$PAT')/' current.tsv > a5)
1.09s user 0.20s system 99% cpu 1.292 total
[look:10000]
time (cat sample-asins-10k.txt | ./look.sh current.tsv > a1)
8.73s user 36.74s system 129% cpu 35.129 total
[SUFARY(sass):10000]
time (cat sample-asins-10k.txt | ./look-sass.sh current.tsv > a2)
9.68s user 265.59s system 103% cpu 4:27.22 total
[look(Search::Dict):10000]
time (cat sample-asins-10k.txt | ./look.pl current.tsv > a3)
0.58s user 0.32s system 100% cpu 0.906 total
[join:10000]
time (join -t$'\t' sample-asins-10k.txt current.tsv > a4)
11.45s user 0.17s system 99% cpu 11.623 total
[perl-regexp:10000]
PAT=`cat sample-asins-10k.txt | xargs | sed 's/ /|/g'`
time (perl -nle 'print if /^('$PAT')/' current.tsv > a5)
1.19s user 0.19s system 99% cpu 1.377 total
[look(Search::Dict):100000] time (cat sample-asins-100k.txt | ./look.pl current.tsv > a7) 6.49s user 2.24s system 99% cpu 8.734 total [join:100000] time (join -t$'\t' sample-asins-100k.txt current.tsv > a8) 11.32s user 0.23s system 99% cpu 11.562 total
( cat sample-asins-10.txt | ./looks.py current.tsv > a6; ) 0.02s user 0.02s system 107% cpu 0.036 total ( cat sample-asins-100.txt | ./looks.py current.tsv > a6; ) 0.05s user 0.03s system 105% cpu 0.082 total ( cat sample-asins-1k.txt | ./looks.py current.tsv > a6; ) 0.44s user 0.16s system 100% cpu 0.600 total ( cat sample-asins-10k.txt | ./looks.py current.tsv > a6; ) 4.25s user 1.33s system 99% cpu 5.582 total ( cat sample-asins-100k.txt | ./looks.py current.tsv > a6; ) 41.67s user 12.60s system 99% cpu 54.311 total
#!/usr/bin/perl
# fesa.pl: for each input line, look up every substring in a SUFARY-indexed
# "key<TAB>value" dictionary and print the matched values with their counts.
#
# Usage: fesa.pl [--answer] [--debug] DICTIONARY < input
#   --answer  each input line is "answer<TAB>text"; the answer is stripped
#             from the text and echoed in front of the output line
#   --debug   echo the input lines and every dictionary hit
use strict;
use warnings;
use Encode;
use utf8;
use open ':utf8';
binmode STDIN, ':utf8';
binmode STDOUT, ':utf8';
use SUFARY;
use Getopt::Long;

my $answer_mode = 0; # input with answer?
my $debug_mode = 0;
GetOptions (
    "answer" => \$answer_mode,
    'debug'  => \$debug_mode,
);

my $wordset_fn = shift;
my $sa = SUFARY->new($wordset_fn);

while (<>) {
    print "[INPUT] $_" if $debug_mode;
    chomp;
    $_ = Encode::decode_utf8($_) if not utf8::is_utf8($_);

    # Declared unconditionally: "my $x = ... if COND" has undefined
    # behavior in Perl, so the declaration and the condition are separated.
    my $ans = "";
    if ($answer_mode and s/^((.+?)\t)//) {
        $ans = $2;
    }

    my @c = split(//, $_);
    my %m;    # dictionary value => number of hits on this line

    for my $i (0 .. $#c) {
        my $key = "";
        my ($left, $right) = (0, $sa->{'arraysize'} - 1);

        # Grow the substring one character at a time, narrowing the
        # suffix-array range as we go.
        for my $j ($i .. $#c) {
            $key .= $c[$j];
            my $ekey = Encode::encode('utf8', $key);

            ($left, $right) = $sa->range_search($ekey, $left, $right);
            # NOTE(review): assumes range_search yields undefined bounds when
            # nothing matches -- confirm against the SUFARY module.
            last if not defined $left and not defined $right;

            # An entry equal to the whole key is followed by the tab that
            # separates key and value.
            my ($l, $r) = $sa->range_search($ekey."\t", $left, $right);
            # The original re-tested $left/$right here, which can never be
            # undefined at this point; the bounds to test are $l/$r.
            next if not defined $l or not defined $r;

            if ($r - $l >= 0) {
                my $li = $sa->get_position($l);
                my $s = Encode::decode_utf8($sa->get_line($li));
                my ($k, $v) = $s =~ /^(.+)\t(.+)$/;
                next if not defined $v;    # line is not "key<TAB>value"
                print "[MATCH] $k ($v)\n" if $debug_mode;
                $m{$v}++;
            }
        }
    }

    print "$ans "if $answer_mode;
    # Values are sorted numerically, i.e. they are expected to be numbers.
    print join(" ", map {"$_:$m{$_}"} sort {$a <=> $b} keys %m)."\n";
    # print join(" ", map {"$_:1"} sort {$a <=> $b} keys %m)."\n";
}
% mkary -l -q fepp-dic.txt % ./fesa.pl -a fepp-dic.txt < fepp-test.txt 1 1:1 2:1 3:1 0 2:1 3:1 4:1 1 5:1 6:1 7:1 8:1 0 7:1 9:1
#!/usr/bin/perl
# regexdic.pl: report every dictionary key that occurs anywhere in each
# input line, using one regex alternation per key length.
#
# Usage: regexdic.pl DICTIONARY < text
#
# DICTIONARY holds one "key<TAB>value" entry per line.  For every input line
# the line itself is echoed after "> ", followed by one "key<TAB>value" line
# per distinct key found, sorted by key.
use strict;
use warnings;
use utf8;
use open ':utf8';
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";

my $dic_fn = shift @ARGV;
die "usage: $0 DICTIONARY < text\n" unless defined $dic_fn;

# $token{LENGTH}{KEY} = VALUE
my %token;
open(my $fh, "<:utf8", $dic_fn) or die "can't open [$dic_fn]: $!";
while (my $entry = <$fh>) {
    chomp $entry;
    next if $entry =~ /^\s*$/;
    my ($k, $c) = split(/\t/, $entry, 2);
    # An empty key would match at every position of every text.
    next if $k eq "";
    $token{length($k)}{$k} = defined $c ? $c : "";
}
close($fh);

# One precompiled alternation per key length.  Keys of equal length are
# distinct, so at most one alternative can match at a given offset.  Keys are
# quotemeta-escaped so that regex metacharacters in them match literally.
my %pat;
foreach my $len (keys %token) {
    my $alt = join("|", map { quotemeta } keys %{$token{$len}});
    $pat{$len} = qr/$alt/;
}

while (my $text = <>) {
    print "> $text";
    chomp $text;
    next if $text =~ /^\s*$/;
    my $s_ref = get_matched_strings(\%pat, $text);
    print join("\n", map {"$_\t".$token{length($_)}{$_}} sort keys %$s_ref)."\n";
}

# get_matched_strings(\%pat, $text)
#   \%pat  - hash ref: key length => pattern (string or qr//) of all keys of
#            that length
#   $text  - string to scan
# Returns a hash ref mapping each matched key to the number of offsets at
# which it was found.
sub get_matched_strings {
    my ($pat_ref, $text) = @_;
    my %match;
    for my $i (0 .. length($text) - 1) {
        # Anchor on the tail starting at $i instead of /^.{$i}(...)/: that
        # form compiles a new regex for every offset and exceeds the regex
        # quantifier limit on very long lines.
        my $tail = substr($text, $i);
        foreach my $len (keys %{$pat_ref}) {
            $match{$1}++ if $tail =~ /^($pat_ref->{$len})/;
        }
    }
    return \%match;
}
あいう r:aiu あい r:ai,k:愛 いえ r:ie,k:家 いうえ r:iue うあい r:uai えあ r:ea おい r:oi,k:甥
% echo 'あいうえおい' | ./regexdic.pl aiueo.dic > あいうえおい あい r:ai,k:愛 あいう r:aiu いうえ r:iue おい r:oi,k:甥
# Build a single alternation over all keys, longest first: Perl tries the
# alternatives left to right, so the longest key wins at any given position.
# Keys are quotemeta-escaped so regex metacharacters in them match literally.
$pat_all = join("|", map { quotemeta } sort {length($b) <=> length($a)} keys %token_all);
#!/usr/bin/perl -T
# CGI script: literal substring search over a fixed text file, with paging.
#
# Query parameters:
#   key - search string, matched literally (not as a regex)
#   f   - 1-based index of the first matching line to show (default 1)
#
# Matching lines are numbered by match order and shown $n per page with the
# matched text highlighted in red.
use strict;
use warnings;
use CGI;
use HTML::Template;

my $filename = "test.txt";
my $n = 10;    # matching lines per page

my $q = CGI->new;

# Only a positive integer is accepted as paging offset; anything else falls
# back to the first page.
my $from = $q->param('f');
$from = 1 unless defined $from and $from =~ /\A[1-9][0-9]*\z/;
my $next_line = $from + $n;
my $pre_line = ($from - $n > 1) ? $from - $n : 1;

# Defined-ness (not truth) is tested so that the key "0" stays searchable.
my $key_org = $q->param('key');
$key_org = "" unless defined $key_org;

# Match the key literally.  The original removed '<' and '>' *after*
# quotemeta, leaving dangling backslashes in the pattern (e.g. "a<" became
# "a\", which fails to compile).  HTML safety is handled by escaping the
# output instead.
my $key_re = qr/\Q$key_org\E/;

# Base URL for the paging links: the current URL without its f= parameter.
my $url = $q->url(-query => 1);
$url =~ s/[;&]f=(\d+)//;

print $q->header(-charset => 'UTF-8');

my $str = "";
if ($key_org !~ /^\s*$/) {
    if (open(my $fh, "<", $filename)) {
        my $count = 0;    # number of matching lines seen so far
        while (my $line = <$fh>) {
            next if $line !~ $key_re;
            $count++;
            next if $count < $from;
            last if $count >= $from + $n;
            $str .= "$count: " . highlight($q, $line, $key_re);
        }
        close($fh);
        $str = "NOT FOUND" if $str eq "";
    } else {
        $str = "ERROR: Can't open '$filename'";
    }
}

my $template = join("", <DATA>);
my $t = HTML::Template->new(scalarref => \$template,
                            global_vars => 1,
                            die_on_bad_params => 0);
$t->param(from => $from);
$t->param(str => $str);
$t->param(next_line => $next_line);
$t->param(pre_line => $pre_line);
$t->param(path => $url);
$t->param(key => $key_org);
print $t->output();

# highlight($cgi, $line, $re)
#   Returns $line as HTML: every part is HTML-escaped and the parts matching
#   $re are wrapped in a red <font> tag.  $re must not contain capture
#   groups of its own.
sub highlight {
    my ($cgi, $line, $re) = @_;
    # split with a capturing pattern returns non-matching and matching parts
    # alternately, so odd indexes are the matches.
    my @parts = split /($re)/, $line, -1;
    my $html = "";
    for my $i (0 .. $#parts) {
        my $esc = $cgi->escapeHTML($parts[$i]);
        $html .= ($i % 2) ? qq{<font color="red">$esc</font>} : $esc;
    }
    return $html;
}

__DATA__
<html lang="ja">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>Text Search</title>
</head>
<body>
<form action="" method="get">
<input type="text" name="key" value="<TMPL_VAR name=key ESCAPE=HTML>">
<input type="submit">
</form>
<TMPL_IF name=str>
<a href="<TMPL_VAR name=path ESCAPE=HTML>&f=<TMPL_VAR name=pre_line>"><<</a>
<a href="<TMPL_VAR name=path ESCAPE=HTML>&f=<TMPL_VAR name=next_line>">>></a>
<hr>
<pre><TMPL_VAR name=str></pre>
<hr>
<a href="<TMPL_VAR name=path ESCAPE=HTML>&f=<TMPL_VAR name=pre_line>"><<</a>
<a href="<TMPL_VAR name=path ESCAPE=HTML>&f=<TMPL_VAR name=next_line>">>></a>
</TMPL_IF>
</body>
</html>
#!/usr/bin/perl -T
# CGI script: search a SUFARY-indexed text file and page through the hits.
#
# Query parameters:
#   key   - search string
#   start - 1-based index of the first hit to show (default 1)
#   num   - number of hits per page (default 10)
use strict;
use warnings;
use CGI;
use SUFARY;
use Encode;
use URI::Escape;
use HTML::Template;
use utf8;
binmode STDOUT, ":utf8";

my $fn = "test-dic.txt";
my $sa = SUFARY->new($fn);

my $q = CGI->new;

# On the first page load there is no key at all; normalize it to "" so the
# later string operations do not warn about an undefined value.
my $key = $q->param('key');
$key = "" unless defined $key;

# Paging parameters must be positive integers; anything else falls back to
# the default.
my $start = positive_int(scalar $q->param('start'), 1);
my $num = positive_int(scalar $q->param('num'), 10);

my $r_ref = search($sa, $key);

my $template = join("", <DATA>);
my $t = HTML::Template->new(scalarref => \$template,
                            global_vars => 1,
                            die_on_bad_params => 0);
$t->param(key => $key);
$t->param(ekey => URI::Escape::uri_escape($key));
$t->param(results => $r_ref->{cont}) if %$r_ref;
$t->param(pre => $r_ref->{pre}) if %$r_ref;
$t->param(nex => $r_ref->{nex}) if %$r_ref;
print $q->header(-charset => 'UTF-8'), decode('utf-8', $t->output());

# positive_int($value, $default)
#   Returns $value if it is a positive decimal integer, otherwise $default.
sub positive_int {
    my ($value, $default) = @_;
    return $default unless defined $value and $value =~ /\A[1-9][0-9]*\z/;
    return $value;
}

# search($sa, $key)
#   $sa  - SUFARY object
#   $key - search string
# Reads the file-level $start and $num.  Returns an empty hash ref when
# there is nothing to show, otherwise
#   { cont => [ {line => ...}, ... ],   hits for the requested page
#     pre  => start index of the previous page (0 if none),
#     nex  => start index of the next page (0 if none) }
sub search {
    my ($sa, $key) = @_;
    return {} if not defined $key or $key eq "";

    my ($left, $right) = $sa->range_search($key);
    # NOTE(review): assumes range_search yields undefined bounds when nothing
    # matches -- confirm against the SUFARY module.
    return {} if not defined $left or not defined $right;

    my $n = $right - $left + 1;           # total number of hits
    my $from = $left + $start - 1;
    return {} if $right < $from;          # requested page is past the end
    my $to = $from + $num - 1;
    $to = $right if $to > $right;

    my @rv;
    for my $k ($from .. $to) {
        my $pos = $sa->get_position($k);
        my $str = $sa->get_line($pos);
        push @rv, {line => $str};
    }

    # Previous page: clamp to the first hit so that a start value that is
    # not page-aligned (e.g. start=5, num=10) still gets a back link.
    my $pre = 0;
    if ($start > 1) {
        $pre = ($start - $num > 0) ? $start - $num : 1;
    }
    my $nex = ($start + $num <= $n) ? $start + $num : 0;
    return {cont => \@rv, pre => $pre, nex => $nex};
}

__DATA__
<html lang="ja">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title></title>
</head>
<body>
<h1></h1>
<form>
<input type="input" name="key" size="30" value="<TMPL_VAR name=key ESCAPE=HTML>">
<input type="submit">
</form>
<TMPL_IF name=results>
<TMPL_LOOP name=results>
<TMPL_VAR name=line ESCAPE=HTML><br>
</TMPL_LOOP>
</TMPL_IF>
<TMPL_IF name=pre>
<a href="?key=<TMPL_VAR name=ekey>&start=<TMPL_VAR name=pre>"><<</a>
</TMPL_IF>
<TMPL_IF name=nex>
<a href="?key=<TMPL_VAR name=ekey>&start=<TMPL_VAR name=nex>">>></a>
</TMPL_IF>
</body>
</html>
「-l」オプションで行頭にインデックスを張ります。% tail -5 test-dic.txt 龍尾神社 龍滕 LONG TENG(赤坂) 1万円入りま〜す 1日なのでお休みです Tシャツ・ラブ・サミットでTシャツを買ってきた! % mkary -l test-dic.txt
1 2 3 4 5 6 7 8 9 10 11 12 13 14 [ 次へ ]
たつをの ChangeLog