69 件 見つかりました。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 [ 次へ ]
複数のクエリ "A120","A280","B020" が一度に与えられて、これらが行頭にマッチする行を全て取得する、という感じ。A102[tab]2022/01/01[tab]2022/02/11[tab]2387 A120[tab]2022/02/20[tab]2023/12/31[tab]100 A280[tab]2022/03/01[tab]2022/03/02[tab]89 B007[tab]2022/04/05[tab]2022/08/29[tab]980 B010[tab]2022/05/01[tab]2022/05/10[tab]12 C763[tab]2023/01/01[tab]2023/06/30[tab]7800 ...
wc current.tsv 1207384 12954296 857036094 current.tsv
B00AE090AL[tab]XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX[tab]XXXX... B00J7A10QC[tab]XXXXXXXXXXXXXXX[tab]XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX... B00JJFB0EC[tab]XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX[tab]XXXXXXXXX...
cut -f1 current.tsv | shuffle.pl | head -10 | sort > sample-asins-10.txt cut -f1 current.tsv | shuffle.pl | head -100 | sort > sample-asins-100.txt cut -f1 current.tsv | shuffle.pl | head -1000 | sort > sample-asins-1k.txt cut -f1 current.tsv | shuffle.pl | head -10000 | sort > sample-asins-10k.txt cut -f1 current.tsv | shuffle.pl | head -100000 | sort > sample-asins-100k.txt
SUFARYについてはここでは説明しない。 mkary -l current.tsv
#!/usr/bin/env zsh
# look.sh -- batch prefix lookup built on look(1).
#
# Usage: look.sh SORTED_FILE < queries
#
# Reads one query per line from stdin, takes the first tab-separated field
# as the key, runs look(1) for that key against SORTED_FILE ($1), and prints
# whatever look(1) returned. Queries without a hit print nothing.
while read -r line
do
    # `read -r` and `print -r` keep backslashes literal; plain `read` and
    # zsh's `echo` would interpret them as escape sequences.
    key=`print -r -- "$line" | cut -f1`
    if [ -n "$key" ]; then
        up=`look "$key" "$1"`
        #up=`sass "$key" "$1"` # look-sass.sh
        if [ -n "$up" ]; then
            print -r -- "$up"
        fi
    fi
done
#!/usr/bin/perl
# look.pl -- batch prefix lookup in a sorted file using Search::Dict.
#
# Usage: look.pl SORTED_FILE < queries
#
# For every non-blank query line, prints all lines of SORTED_FILE that start
# with the query (the same result look(1) gives for one key).
use strict;
use warnings;
use Search::Dict;

my $fn = shift;
die "usage: $0 SORTED_FILE < queries\n" if not defined $fn;
open(my $fh, "<", $fn) or die "can't open [$fn]: $!";

while (my $key = <>) {
    chomp $key;
    next if $key =~ /^\s*$/;

    # look() positions $fh at the first line that is stringwise >= $key
    # and returns -1 when it could not do so.
    if (look($fh, $key) < 0) {
        warn "look failed for [$key]\n";
        next;
    }

    # Lines sharing the prefix are contiguous in a sorted file, so read
    # until the first line that no longer starts with the key. readline
    # yields undef at end of file, which also ends the loop.
    while (defined(my $line = readline($fh))) {
        last if index($line, $key) != 0;
        print $line;
    }
}
close($fh);
| 評価対象ツール \ クエリ数 | 10 | 100 | 1000 | 1万 | 10万 |
|---|---|---|---|---|---|
| look.sh (look) | 0.04 | 0.35 | 3.6 | 35.1 | |
| look-sass.sh (SUFARY) | 0.23 | 2.86 | 28.6 | 267.2 | |
| look.pl (Search::Dict) | 0.01 | 0.03 | 0.1 | 0.9 | 8.7 |
| looks.py (pure Python) | 0.04 | 0.08 | 0.6 | 5.6 | 54.3 |
| join | 11.1 | 11.2 | 11.4 | 11.6 | 11.6 |
| perl-regexp | 1.3 | 1.3 | 1.3 | 1.4 | |
[look:10]
time (cat sample-asins-10.txt | ./look.sh current.tsv > a1)
0.01s user 0.04s system 132% cpu 0.038 total
[SUFARY(sass):10]
time (cat sample-asins-10.txt | ./look-sass.sh current.tsv > a2)
0.01s user 0.26s system 104% cpu 0.259 total
[look(Search::Dict):10]
time (cat sample-asins-10.txt | ./look.pl current.tsv > a3)
0.01s user 0.01s system 123% cpu 0.010 total
[join:10]
time (cat sample-asins-10.txt | join -t$'\t' - current.tsv > a4)
10.92s user 0.15s system 99% cpu 11.067 total
[perl-regexp:10]
PAT=`cat sample-asins-10.txt | xargs | sed 's/ /|/g'`
time (perl -nle 'print if /^('$PAT')/' current.tsv > a5)
1.09s user 0.21s system 99% cpu 1.307 total
[look:100]
time (cat sample-asins-100.txt | ./look.sh current.tsv > a1)
0.04s user 0.37s system 117% cpu 0.350 total
[SUFARY(sass):100]
time (cat sample-asins-100.txt | ./look-sass.sh current.tsv > a2)
0.12s user 2.82s system 103% cpu 2.857 total
[look(Search::Dict):100]
time (cat sample-asins-100.txt | ./look.pl current.tsv > a3)
0.02s user 0.02s system 143% cpu 0.026 total
[join:100]
time (join -t$'\t' sample-asins-100.txt current.tsv > a4)
10.98s user 0.19s system 99% cpu 11.173 total
[perl-regexp:100]
PAT=`cat sample-asins-100.txt | xargs | sed 's/ /|/g'`
time (perl -nle 'print if /^('$PAT')/' current.tsv > a5)
1.10s user 0.22s system 99% cpu 1.318 total
[look:1000]
time (cat sample-asins-1k.txt | ./look.sh current.tsv > a1)
0.95s user 3.72s system 131% cpu 3.553 total
[SUFARY(sass):1000]
time (cat sample-asins-1k.txt | ./look-sass.sh current.tsv > a2)
0.98s user 28.41s system 102% cpu 28.585 total
[look(Search::Dict):1000]
time (cat sample-asins-1k.txt | ./look.pl current.tsv > a3)
0.09s user 0.03s system 105% cpu 0.109 total
[join:1000]
time (join -t$'\t' sample-asins-1k.txt current.tsv > a4)
11.25s user 0.18s system 99% cpu 11.440 total
[perl-regexp:1000]
PAT=`cat sample-asins-1k.txt | xargs | sed 's/ /|/g'`
time (perl -nle 'print if /^('$PAT')/' current.tsv > a5)
1.09s user 0.20s system 99% cpu 1.292 total
[look:10000]
time (cat sample-asins-10k.txt | ./look.sh current.tsv > a1)
8.73s user 36.74s system 129% cpu 35.129 total
[SUFARY(sass):10000]
time (cat sample-asins-10k.txt | ./look-sass.sh current.tsv > a2)
9.68s user 265.59s system 103% cpu 4:27.22 total
[look(Search::Dict):10000]
time (cat sample-asins-10k.txt | ./look.pl current.tsv > a3)
0.58s user 0.32s system 100% cpu 0.906 total
[join:10000]
time (join -t$'\t' sample-asins-10k.txt current.tsv > a4)
11.45s user 0.17s system 99% cpu 11.623 total
[perl-regexp:10000]
PAT=`cat sample-asins-10k.txt | xargs | sed 's/ /|/g'`
time (perl -nle 'print if /^('$PAT')/' current.tsv > a5)
1.19s user 0.19s system 99% cpu 1.377 total
[look(Search::Dict):100000] time (cat sample-asins-100k.txt | ./look.pl current.tsv > a7) 6.49s user 2.24s system 99% cpu 8.734 total [join:100000] time (join -t$'\t' sample-asins-100k.txt current.tsv > a8) 11.32s user 0.23s system 99% cpu 11.562 total
( cat sample-asins-10.txt | ./looks.py current.tsv > a6; ) 0.02s user 0.02s system 107% cpu 0.036 total ( cat sample-asins-100.txt | ./looks.py current.tsv > a6; ) 0.05s user 0.03s system 105% cpu 0.082 total ( cat sample-asins-1k.txt | ./looks.py current.tsv > a6; ) 0.44s user 0.16s system 100% cpu 0.600 total ( cat sample-asins-10k.txt | ./looks.py current.tsv > a6; ) 4.25s user 1.33s system 99% cpu 5.582 total ( cat sample-asins-100k.txt | ./looks.py current.tsv > a6; ) 41.67s user 12.60s system 99% cpu 54.311 total
#!/usr/bin/perl
# fesa.pl -- count dictionary hits among all substrings of each input line.
#
# Usage: fesa.pl [--answer] [--debug] DICTIONARY < input
#
# DICTIONARY is a SUFARY-indexed file of "key<TAB>value" lines. For each
# input line, every substring is looked up as a key; the values of the
# entries found are counted and printed as "value:count" pairs sorted
# numerically by value. With --answer, a leading "answer<TAB>" field is cut
# from the line and echoed in front of the pairs. With --debug, the input
# line and every match are echoed as well.
use strict;
use warnings;
use Encode;
use utf8;
use open ':utf8';
binmode STDIN, ':utf8';
binmode STDOUT, ':utf8';
use SUFARY;
use Getopt::Long;

my $answer_mode = 0; # input with answer?
my $debug_mode = 0;
GetOptions (
    "answer" => \$answer_mode,
    'debug'  => \$debug_mode,
);

my $wordset_fn = shift;
my $sa = SUFARY->new($wordset_fn);

while (<>) {
    print "[INPUT] $_" if $debug_mode;
    chomp;
    $_ = Encode::decode_utf8($_) if not utf8::is_utf8($_);

    # Declared unconditionally: "my ... if COND" has undefined behaviour.
    my $ans = "";
    if ($answer_mode and s/^((.+?)\t)//) {
        $ans = $2;
    }

    my @c = split(//, $_);
    my %m;
    for (my $i = 0; $i < @c; $i++) {
        my $key = "";
        my ($left, $right) = (0, $sa->{'arraysize'}-1);
        for (my $j = $i; $j < @c; $j++) {
            $key .= $c[$j];
            my $ekey = Encode::encode('utf8', $key);

            # Narrow the previous range with the longer key. Once nothing
            # is left, no longer substring starting at $i can hit either.
            ($left, $right) = $sa->range_search($ekey, $left, $right);
            last if not defined $left or not defined $right;

            # Appending the tab restricts the hit to entries whose key
            # field is exactly $key.
            my ($l, $r) = $sa->range_search($ekey."\t", $left, $right);
            next if not defined $l or not defined $r;

            if ($r - $l >= 0) {
                my $li = $sa->get_position($l);
                my $s = Encode::decode_utf8($sa->get_line($li));
                # Skip lines that are not "key<TAB>value" rather than
                # counting an undefined value.
                if (my ($k, $v) = $s =~ /^(.+)\t(.+)$/) {
                    print "[MATCH] $k ($v)\n" if $debug_mode;
                    $m{$v}++;
                }
            }
        }
    }

    print "$ans " if $answer_mode;
    print join(" ", map {"$_:$m{$_}"} sort {$a <=> $b} keys %m)."\n";
    # print join(" ", map {"$_:1"} sort {$a <=> $b} keys %m)."\n";
}
% mkary -l -q fepp-dic.txt % ./fesa.pl -a fepp-dic.txt < fepp-test.txt 1 1:1 2:1 3:1 0 2:1 3:1 4:1 1 5:1 6:1 7:1 8:1 0 7:1 9:1
#!/usr/bin/perl
# regexdic.pl -- find every dictionary key that occurs in each input line.
#
# Usage: regexdic.pl DICTIONARY [FILE...]
#
# DICTIONARY holds "key<TAB>content" lines. Keys are grouped by length and
# each group becomes one regex alternation. For every input line the script
# echoes the line prefixed with "> " and then prints the matched keys with
# their content, sorted by key.
use strict;
use warnings;
use utf8;
use open ':utf8';
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";

my $dic_fn = shift @ARGV;
die "usage: $0 DICTIONARY [FILE...]\n" if not defined $dic_fn;

# $token{length}{key} = content
my %token;
open(my $fh, "<:utf8", $dic_fn) or die "can't open [$dic_fn]: $!";
while (<$fh>) {
    chomp;
    next if /^\s*$/;
    my ($k, $c) = split(/\t/, $_, 2);
    next if $k eq "";            # an empty key would match at every position
    $c = "" if not defined $c;   # line without a tab: key with empty content
    $token{length($k)}{$k} = $c;
}
close($fh);

# One alternation per key length. Keys are matched literally, so regex
# metacharacters inside a key must be escaped.
my %pat;
foreach my $len (keys %token) {
    $pat{$len} = join("|", map { quotemeta } keys %{$token{$len}});
}

while (<>) {
    print "> $_";
    chomp;
    next if /^\s*$/;
    my $s_ref = get_matched_strings(\%pat, $_);
    print join("\n", map {"$_\t".$token{length($_)}{$_}} sort keys %$s_ref)."\n";
}
# get_matched_strings(\%pat, $text)
#
# \%pat maps an arbitrary label (the caller uses key length) to a regex
# alternation string. Every alternation is tried at every character offset
# of $text. Returns a hash ref mapping each matched string to the number of
# offsets at which it matched.
sub get_matched_strings {
    my ($pat_ref, $text) = @_;

    # Compile each alternation once. \G anchors the match at pos($text),
    # so no prefix has to be rescanned for every offset.
    my @regexes = map { qr/\G($_)/ } values %{$pat_ref};

    my %match;
    for (my $i = 0; $i < length($text); $i++) {
        foreach my $re (@regexes) {
            # A failed //g match resets pos, so set it before every try.
            pos($text) = $i;
            $match{$1}++ if $text =~ /$re/g;
        }
    }
    return \%match;
}
あいう r:aiu あい r:ai,k:愛 いえ r:ie,k:家 いうえ r:iue うあい r:uai えあ r:ea おい r:oi,k:甥
% echo 'あいうえおい' | ./regexdic.pl aiueo.dic > あいうえおい あい r:ai,k:愛 あいう r:aiu いうえ r:iue おい r:oi,k:甥
# One alternation over all keys, longest first so that the longest key wins
# at a given position. Keys are escaped because they are matched literally.
$pat_all = join("|", map { quotemeta } sort {length($b) <=> length($a)} keys %token_all);
#!/usr/bin/perl -T
# Text-search CGI: lists the lines of $filename that contain the query
# string, $n hits per page, with the query highlighted in red.
#
# Parameters:
#   key  literal search string (not a regex)
#   f    1-based number of the first hit to show (default 1)
use strict;
use warnings;
use CGI;
use HTML::Template;

my $filename = "test.txt";
my $n = 10;

my $q = CGI->new;

# Anything that is not a positive integer falls back to the first page.
my $from = $q->param('f');
$from = (defined $from and $from =~ /\A([1-9][0-9]*)\z/) ? $1 : 1;
my $next_line = $from + $n;
my $pre_line = ($from - $n > 1) ? $from - $n : 1;

my $key_org = $q->param('key');
$key_org = "" if not defined $key_org;

# Base URL for the pager links: the current URL without any "f" parameter.
my $url = $q->url(-query => 1);
$url =~ s/[;&]f=[^;&]*//g;

print $q->header(-charset => 'UTF-8');

my $str = "";
if ($key_org ne "") {
    if (open(my $fh, "<", $filename)) {
        # The key is matched literally, so every match equals the key and
        # the highlighted replacement can be built once.
        my $mark = '<font color="red">' . $q->escapeHTML($key_org) . '</font>';
        my $count = 0;
        while (my $line = <$fh>) {
            # Splitting on the key (limit -1 keeps trailing empty pieces)
            # yields the text between matches; fewer than two pieces
            # means the line has no match.
            my @pieces = split(/\Q$key_org\E/, $line, -1);
            next if @pieces < 2;
            $count++;
            next if $count < $from;
            last if $count >= $from + $n;
            # Escape the file text piecewise so that only the highlight
            # markup reaches the browser as HTML.
            $str .= "$count: " . join($mark, map { $q->escapeHTML($_) } @pieces);
        }
        close($fh);
        $str = "NOT FOUND" if $str eq "";
    } else {
        $str = "ERROR: Can't open '$filename'";
    }
}

my $template = do { local $/; <DATA> };
my $t = HTML::Template->new(scalarref => \$template,
                            global_vars => 1,
                            die_on_bad_params => 0);
$t->param(from => $from);
$t->param(str => $str);
$t->param(next_line => $next_line);
$t->param(pre_line => $pre_line);
$t->param(path => $url);
$t->param(key => $key_org);
print $t->output();
__DATA__
<html lang="ja">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>Text Search</title>
</head>
<body>
<form action="" method="get">
<input type="text" name="key" value="<TMPL_VAR name=key escape=html>">
<input type="submit">
</form>
<TMPL_IF name=str>
<a href="<TMPL_VAR name=path escape=html>&amp;f=<TMPL_VAR name=pre_line>">&lt;&lt;</a>
<a href="<TMPL_VAR name=path escape=html>&amp;f=<TMPL_VAR name=next_line>">&gt;&gt;</a>
<hr>
<pre><TMPL_VAR name=str></pre>
<hr>
<a href="<TMPL_VAR name=path escape=html>&amp;f=<TMPL_VAR name=pre_line>">&lt;&lt;</a>
<a href="<TMPL_VAR name=path escape=html>&amp;f=<TMPL_VAR name=next_line>">&gt;&gt;</a>
</TMPL_IF>
</body>
</html>
#!/usr/bin/perl -T
# Search CGI over a SUFARY-indexed file: shows the indexed lines found for
# the query, $num lines per page.
#
# Parameters:
#   key    search string
#   start  1-based number of the first hit to show (default 1)
#   num    hits per page (default 10)
use strict;
use warnings;
use CGI;
use SUFARY;
use Encode;
use URI::Escape;
use HTML::Template;
use utf8;
binmode STDOUT, ":utf8";

my $fn = "test-dic.txt";
my $sa = SUFARY->new($fn);

my $q = CGI->new;
my $key = $q->param('key');
$key = "" if not defined $key;
# Only positive integers are accepted: a negative or non-numeric value
# would move the window outside the range of hits.
my $start = positive_int(scalar $q->param('start'), 1);
my $num = positive_int(scalar $q->param('num'), 10);

my $r_ref = search($sa, $key);

my $template = do { local $/; <DATA> };
my $t = HTML::Template->new(scalarref => \$template,
                            global_vars => 1,
                            die_on_bad_params => 0);
$t->param(key => $key);
$t->param(ekey => URI::Escape::uri_escape($key));
$t->param(results => $r_ref->{cont}) if %$r_ref;
$t->param(pre => $r_ref->{pre}) if %$r_ref;
$t->param(nex => $r_ref->{nex}) if %$r_ref;
print $q->header(-charset => 'UTF-8'), decode('utf-8', $t->output());

# positive_int($value, $default)
# Returns $value when it is a positive decimal integer, otherwise $default.
sub positive_int {
    my ($value, $default) = @_;
    return $default if not defined $value;
    return ($value =~ /\A([1-9][0-9]*)\z/) ? $1 : $default;
}

# search($sa, $key)
# Looks $key up in the suffix array and returns a hash ref with
#   cont => array ref of {line => ...} rows for the current page,
#   pre  => start value for the previous page (0 when there is none),
#   nex  => start value for the next page (0 when there is none),
# or an empty hash ref when the key is empty, nothing was found, or the
# requested page lies beyond the last hit. Reads the file-level $start
# and $num.
sub search {
    my ($sa, $key) = @_;
    return {} if not defined $key or $key eq "";
    my ($left, $right) = $sa->range_search($key);
    return {} if not defined $left or not defined $right;

    my $n = $right - $left + 1;
    my $from = $left + $start - 1;
    return {} if $right < $from;
    my $to = $from + $num - 1;
    $to = $right if $to > $right;

    my @rv;
    for (my $k = $from; $k <= $to; $k++) {
        my $pos = $sa->get_position($k);
        my $str = $sa->get_line($pos);
        push @rv, {line => $str};
    }

    my $pre = ($start - $num > 0) ? $start - $num : 0;
    my $nex = ($start + $num <= $n) ? $start + $num : 0;
    return {cont => \@rv, pre => $pre, nex => $nex};
}
__DATA__
<html lang="ja">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title></title>
</head>
<body>
<h1></h1>
<form>
<input type="text" name="key" size="30" value="<TMPL_VAR name=key escape=html>">
<input type="submit">
</form>
<TMPL_IF name=results>
<TMPL_LOOP name=results>
<TMPL_VAR name=line escape=html><br>
</TMPL_LOOP>
</TMPL_IF>
<TMPL_IF name=pre>
<a href="?key=<TMPL_VAR name=ekey>&amp;start=<TMPL_VAR name=pre>">&lt;&lt;</a>
</TMPL_IF>
<TMPL_IF name=nex>
<a href="?key=<TMPL_VAR name=ekey>&amp;start=<TMPL_VAR name=nex>">&gt;&gt;</a>
</TMPL_IF>
</body>
</html>
「-l」オプションで行頭にインデックスを張ります。% tail -5 test-dic.txt 龍尾神社 龍滕 LONG TENG(赤坂) 1万円入りま〜す 1日なのでお休みです Tシャツ・ラブ・サミットでTシャツを買ってきた! % mkary -l test-dic.txt

1 2 3 4 5 6 7 8 9 10 11 12 13 14 [ 次へ ]
たつをの ChangeLog