データサイズは130Mバイト。53万エントリ。ヨーグルト ダヒ:4 スメタナ:12 イリヤ・メチニコフ:17 腸内細菌:39 ... 鶴巻温泉 陣屋事件:7 鶴巻温泉駅:13 秦野市:197 ... 麦芽 糖化:27 発芽:41 水飴:70 酵素:451 ビール:545 ...
(ライセンス=「無償・無保証・著作権放棄」#!/usr/bin/perl # usage: bunzip2 -c jawiki-latest-pages-articles.xml.bz2|prog.pl>wp-rel.txt use strict; use warnings; use utf8; use Encode; use open ':utf8'; binmode STDIN, ":utf8"; binmode STDOUT, ":utf8"; my %title_word; my %df; my $flag_page_inside = 0; my $page; open(F, ">:utf8", "./tmp-removeme.txt") or die "can't open a tmpfile"; while (<>) { my $line = $_; if ($flag_page_inside == 0 and $line =~ m{<page>}) { $flag_page_inside = 1; } if ($flag_page_inside == 1) { $page .= $line; } if ($flag_page_inside == 1 and $line =~ m{</page>}) { my ($title) = $page =~ m{<title>([^<]+)<}; $title = word_ok($title); if ($title ne "" and not defined $title_word{$title}) { $title_word{$title} = 1; $df{$title}++; my %relwords; while ($page =~ m{\[\[(.+?)\]\]}g) { my $w = word_ok($1); next if $w eq ""; next if $w eq $title; next if defined $relwords{$w}; $relwords{$w}++; $df{$w}++; print F "$title\t$w\n"; print F "$w\t$title\n"; } } $flag_page_inside = 0; $page = ""; } } close F; sub word_ok { my ($w) = @_; return "" unless $w; return "" if $w =~ m{[:\|\n\t\[\]]};; return "" if $w =~ m{^[\#\s]};; $w =~ s/\s*\(.+?\)$//; return "" if $w =~ /一覧$/; return "" if $w =~ /\d+月\d+日$/; return "" if $w =~ /^\d{4}年$/; return $w; } my @list; my $preword = ""; open(F, "-|:utf8", "sort ./tmp-removeme.txt") or die "can't open a tmpfile"; while (<F>) { chomp; my @c = split("\t", $_); next if (@c != 2); if ($preword ne $c[0]) { calc_rel($preword, \@list); $preword = $c[0]; @list = (); } next unless defined $title_word{$c[0]}; next unless defined $title_word{$c[1]}; push @list, $c[1]; } close F; calc_rel($preword, \@list) if @list > 0; sub calc_rel { my ($title, $lp) = @_; return if @$lp == 0; my %count; foreach my $w (@$lp) { $count{$w}++; } my @rws = map {"$_:$df{$_}"} sort { $df{$a}/(($count{$a}-1)*1000000+1) <=> $df{$b}/(($count{$b}-1)*1000000+1) } keys %count; @rws = @rws[0..20] if @rws > 20; print "$title\t", join("\t", @rws), "\n"; }