==> a1.txt <== a b c d e f a ==> a2.txt <== a e b c d d a f b ==> a3.txt <== b c c a e f
#!/usr/bin/env perl
#
# Merge tab-separated "key<TAB>value<TAB>value..." lines read from the files
# named on the command line (or from STDIN when none are given).
#
# Every value seen for a key, in any input line, is collected; the output has
# one line per key, in sorted key order, followed by that key's sorted,
# de-duplicated values. The whole data set is held in memory.
use strict;
use warnings;

# Run the merge only when executed as a script, so the subs below can be
# loaded (require/do) and exercised on their own.
main() unless caller;

# main(): read all input through <> and print the merged table to STDOUT.
sub main {
    my %link;
    while (my $line = <>) {
        add_line(\%link, $line);
    }
    print "$_\n" for merged_lines(\%link);
    return;
}

# add_line(\%link, $line): record the key and values of one input line in
# %link, a hash of key => { value => 1, ... }.
sub add_line {
    my ($link, $line) = @_;
    chomp $line;
    my ($key, @values) = split /\t/, $line;

    # A blank line splits into an empty list, leaving $key undefined.
    return unless defined $key;

    # Register the key even when the line carries no values, so such keys
    # still appear in the output instead of being silently dropped.
    $link->{$key} //= {};
    $link->{$key}{$_} = 1 for @values;
    return;
}

# merged_lines(\%link): return the output lines (without trailing newline),
# keys in sorted order, each followed by its sorted values.
sub merged_lines {
    my ($link) = @_;
    return map { join("\t", $_, sort keys %{ $link->{$_} }) }
           sort keys %{$link};
}
# In-memory merge of the TSV files: for every key (first column) collect the
# union of all values from all files, then print one line per key with the
# key and its values in sorted order. -n loops over input lines, -l chomps
# each line on input and appends a newline on print.
perl -nle '
    ($k, @c) = split /\t/;
    next unless defined $k;    # blank line: nothing to record
    $h{$k} //= {};             # keep keys that have no values
    $h{$k}{$_} = 1 for @c;
    END {
        for (sort keys %h) {
            print join("\t", $_, sort keys %{ $h{$_} });
        }
    }
' a1.txt a2.txt a3.txt
a b c e b c d c a d a e e f f a b
#!/usr/bin/env perl
#
# merge-large-tsv.pl - merge tab-separated "key<TAB>value<TAB>value..." files
# without loading them into memory.
#
# Usage: merge-large-tsv.pl FILE [FILE ...]
#
# Precondition: every input file must already be sorted by key in plain
# string order (the order of Perl's "cmp"). This is not checked; unsorted
# input produces unsorted output with unmerged duplicate keys.
#
# Output (STDOUT): one line per distinct key, in key order, holding the key
# followed by the sorted, de-duplicated union of its values from all files.
use strict;
use warnings;
use List::Util qw(minstr);

# Run the merge only when executed as a script, so the subs below can be
# loaded (require/do) and exercised on their own.
main(@ARGV) unless caller;

# main(@file_names): k-way merge of the named files to the selected output
# handle. Only one pending record per file is kept in memory.
# Dies if a file cannot be opened.
sub main {
    my @file_names = @_;
    my (@handles, @pending);

    for my $i (0 .. $#file_names) {
        open($handles[$i], '<', $file_names[$i])
            or die "Cannot open '$file_names[$i]': $!\n";
        $pending[$i] = read_oneline($handles[$i]);
    }

    while (1) {
        # Files that still have an unconsumed record.
        my @active = grep { !$pending[$_]{eof} } 0 .. $#pending;
        last unless @active;

        # The smallest pending key is the next one to emit.
        my $key = minstr(map { $pending[$_]{key} } @active);

        # Drain every record carrying that key, from every file. The inner
        # loop also merges a key repeated on consecutive lines of one file.
        my %values;
        for my $i (@active) {
            while (!$pending[$i]{eof} && $pending[$i]{key} eq $key) {
                @values{ @{ $pending[$i]{cont} } } = ();
                $pending[$i] = read_oneline($handles[$i]);
            }
        }

        print join("\t", $key, sort keys %values), "\n";
    }

    close($_) for @handles;
    return;
}

# read_oneline($fh): read the next record from $fh.
# Returns a hash ref { key => $key, cont => \@values } for a data line, or
# { key => "", cont => [], eof => 1 } once the file is exhausted. Lines that
# split into no fields (blank lines) are skipped. End of input is signalled
# by the eof flag rather than by the empty key, so a line whose key is the
# empty string no longer makes the rest of the file look exhausted.
sub read_oneline {
    my ($fh) = @_;
    while (defined(my $line = <$fh>)) {
        chomp $line;
        my ($key, @c) = split(/\t/, $line);
        next unless defined $key;
        return { key => $key, cont => \@c };
    }
    return { key => "", cont => [], eof => 1 };
}
% ./merge-large-tsv.pl a1.txt a2.txt a3.txt a b c e b c d c a d a e e f f a b