==> a1.txt <==
a	b	c
d	e
f	a

==> a2.txt <==
a	e
b	c	d
d	a
f	b

==> a3.txt <==
b	c
c	a
e	f
#!/usr/bin/env perl
use strict;
use warnings;
# In-memory merge: the first tab-separated column of every input line is the
# key; every remaining column is a value recorded under that key.  After all
# input (files named in @ARGV, or STDIN) is consumed, one line per key is
# printed: the key followed by its distinct values, everything in string
# sort order.
my %values_for;

while (my $line = <>) {
    chomp $line;
    my ($name, @fields) = split /\t/, $line;

    # Hash keys give us de-duplication for free.
    $values_for{$name}{$_} = 1 for @fields;
}

for my $name (sort keys %values_for) {
    my @merged = sort keys %{ $values_for{$name} };
    print join("\t", $name, @merged), "\n";
}
# One-liner equivalent of the in-memory merge: collect the distinct values
# seen for each key (first tab-separated column) across all files, then print
# one sorted, tab-separated line per key once the input is exhausted.
# -l chomps input and appends "\n" on print; -a/-F'\t' autosplit into @F.
perl -F'\t' -lane '
    ($key, @vals) = @F;
    $seen{$key}{$_} = 1 for @vals;
    END {
        for $k (sort keys %seen) {
            print join("\t", $k, sort keys %{ $seen{$k} });
        }
    }
' a1.txt a2.txt a3.txt
a	b	c	e
b	c	d
c	a
d	a	e
e	f
f	a	b
#!/usr/bin/env perl
use strict;
use warnings;
# Streaming k-way merge of TSV files.
#
# Each file named on the command line is read one line at a time, so memory
# use is bounded by the number of files rather than their size.  The
# algorithm only works if every input file is already sorted by key (first
# column) in string order.
# NOTE(review): it also appears to assume a key occurs at most once per
# file; a key repeated within one file can be printed more than once --
# confirm against the inputs you feed it.
#
# Output: one line per key, the key followed by the sorted, de-duplicated
# union of the values found for it in all files.
my @fns = @ARGV;
my @fhs;    # one read handle per input file
my @lis;    # current, not-yet-emitted record of each file (key "" = exhausted)

for my $i (0 .. $#fns) {
    # Report which file failed and why instead of a bare "Died".
    open($fhs[$i], "<", $fns[$i]) or die "cannot open '$fns[$i]': $!";
    $lis[$i] = read_oneline($fhs[$i]);
}

while (1) {
    # Order the file indices by their current key.  Exhausted files carry the
    # key "" and therefore sort to the front.
    my @ixs = sort { $lis[$a]{key} cmp $lis[$b]{key} } 0 .. $#lis;

    # Skip past the exhausted files; $ci ends on the smallest live key.
    my $ci = 0;
    $ci++ while $ci < @ixs and $lis[$ixs[$ci]]{key} eq "";
    last if $ci == @lis;    # every file is exhausted

    my $cur  = $lis[$ixs[$ci]];
    my $next = $ci < $#ixs ? $lis[$ixs[$ci + 1]] : undef;

    # Distinct values of the current record.
    my %h = map { $_ => 1 } @{ $cur->{cont} };

    if (defined $next and $next->{key} eq $cur->{key}) {
        # Another file holds the same key: fold our values into that record
        # and let it be emitted (or folded again) on a later iteration.
        $h{$_} = 1 for @{ $next->{cont} };
        $next->{cont} = [sort keys %h];
    }
    else {
        # No other file has this key any more.  Always print the sorted
        # unique set so the output format does not depend on whether the key
        # came from one file or from several.
        print join("\t", $cur->{key}, sort keys %h) . "\n";
    }

    # Either way the current record is consumed; advance that file.
    $lis[$ixs[$ci]] = read_oneline($fhs[$ixs[$ci]]);
}

for my $fh (@fhs) {
    close($fh);
}
# read_oneline($fh)
#
# Reads the next usable record from the filehandle $fh and returns it as a
# hash reference:
#   { key => <first tab-separated column>, cont => [ remaining columns ] }
# At end of file it returns { key => "", cont => [] }; callers use the empty
# key as the "this file is exhausted" sentinel.
#
# Because "" is the sentinel, a blank line or a line whose first column is
# empty must never be returned as a record -- the caller would mistake it
# for end of file and drop the rest of the file.  Such lines are skipped
# (with a warning when the line is not simply blank).
sub read_oneline {
    my ($fh) = @_;

    while (defined(my $line = <$fh>)) {
        chomp $line;
        my ($key, @c) = split(/\t/, $line);

        if (!defined $key or $key eq "") {
            # $. is the line number of the handle we just read from.
            warn "read_oneline: skipping line $. with an empty key\n"
                if $line ne "";
            next;
        }

        return {key => $key, cont => \@c};
    }

    # cont is supplied so callers can always dereference it safely.
    return {key => "", cont => []};
}
% ./merge-large-tsv.pl a1.txt a2.txt a3.txt
a	b	c	e
b	c	d
c	a
d	a	e
e	f
f	a	b