#!/usr/bin/perl use strict; use warnings; use utf8; use open ':utf8'; binmode STDIN, ":utf8"; binmode STDOUT, ":utf8"; my $dic_fn = shift @ARGV; my %token; open(my $fh, "<:utf8", $dic_fn) or die; while (<$fh>) { chomp; next if /^\s*$/; my ($k, $c) = split(/\t/, $_, 2); $token{length($k)}{$k} = $c; } close($fh); my %pat; foreach my $i (keys %token) { $pat{$i} = join("|", keys %{$token{$i}}); } while (<>) { print "> $_"; chomp; next if /^\s*$/; my $s_ref = get_matched_strings(\%pat, $_); print join("\n", map {"$_\t".$token{length($_)}{$_}} sort keys %$s_ref)."\n"; } sub get_matched_strings { my ($pat_ref, $text) = @_; my %match; for (my $i = 0; $i < length($text); $i++) { foreach my $len (keys %{$pat_ref}) { $match{$1}++ if $text =~ /^.{$i}($pat_ref->{$len})/; } } return \%match; };
あいう r:aiu あい r:ai,k:愛 いえ r:ie,k:家 いうえ r:iue うあい r:uai えあ r:ea おい r:oi,k:甥
% echo 'あいうえおい' | ./regexdic.pl aiueo.dic > あいうえおい あい r:ai,k:愛 あいう r:aiu いうえ r:iue おい r:oi,k:甥
$pat_all = join("|", sort {length($b) <=> length($a)} keys %token_all);