




データサイズは130Mバイト。53万エントリ。ヨーグルト ダヒ:4 スメタナ:12 イリヤ・メチニコフ:17 腸内細菌:39 ... 鶴巻温泉 陣屋事件:7 鶴巻温泉駅:13 秦野市:197 ... 麦芽 糖化:27 発芽:41 水飴:70 酵素:451 ビール:545 ...
#!/usr/bin/perl
# usage: bunzip2 -c jawiki-latest-pages-articles.xml.bz2|prog.pl>wp-rel.txt
use strict;
use warnings;
use utf8;
use Encode;
use open ':utf8';
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
my %title_word;
my %df;
my $flag_page_inside = 0;
my $page;
open(F, ">:utf8", "./tmp-removeme.txt") or die "can't open a tmpfile";
while (<>) {
my $line = $_;
if ($flag_page_inside == 0 and $line =~ m{<page>}) {
$flag_page_inside = 1;
}
if ($flag_page_inside == 1) {
$page .= $line;
}
if ($flag_page_inside == 1 and $line =~ m{</page>}) {
my ($title) = $page =~ m{<title>([^<]+)<};
$title = word_ok($title);
if ($title ne "" and not defined $title_word{$title}) {
$title_word{$title} = 1;
$df{$title}++;
my %relwords;
while ($page =~ m{\[\[(.+?)\]\]}g) {
my $w = word_ok($1);
next if $w eq "";
next if $w eq $title;
next if defined $relwords{$w};
$relwords{$w}++;
$df{$w}++;
print F "$title\t$w\n";
print F "$w\t$title\n";
}
}
$flag_page_inside = 0;
$page = "";
}
}
close F;
sub word_ok {
my ($w) = @_;
return "" unless $w;
return "" if $w =~ m{[:\|\n\t\[\]]};;
return "" if $w =~ m{^[\#\s]};;
$w =~ s/\s*\(.+?\)$//;
return "" if $w =~ /一覧$/;
return "" if $w =~ /\d+月\d+日$/;
return "" if $w =~ /^\d{4}年$/;
return $w;
}
my @list;
my $preword = "";
open(F, "-|:utf8", "sort ./tmp-removeme.txt") or die "can't open a tmpfile";
while (<F>) {
chomp;
my @c = split("\t", $_);
next if (@c != 2);
if ($preword ne $c[0]) {
calc_rel($preword, \@list);
$preword = $c[0];
@list = ();
}
next unless defined $title_word{$c[0]};
next unless defined $title_word{$c[1]};
push @list, $c[1];
}
close F;
calc_rel($preword, \@list) if @list > 0;
sub calc_rel {
my ($title, $lp) = @_;
return if @$lp == 0;
my %count;
foreach my $w (@$lp) {
$count{$w}++;
}
my @rws = map {"$_:$df{$_}"} sort {
$df{$a}/(($count{$a}-1)*1000000+1)
<=> $df{$b}/(($count{$b}-1)*1000000+1)
} keys %count;
@rws = @rws[0..20] if @rws > 20;
print "$title\t", join("\t", @rws), "\n";
}
(ライセンス=「無償・無保証・著作権放棄」