#!/usr/bin/perl
#
# Summarise a stream of tweets supplied as one JSON object per line.
#
# Input is read from the files named on the command line, or from STDIN.
# Every line that carries a top-level "text" and a parseable "created_at"
# field is counted.  The report printed to STDOUT contains:
#
#   ave bytes : average UTF-8 byte length of the decoded tweet text
#   ave chars : average character length of the decoded tweet text
#   ave jsons : average length of the raw input line (terminator included)
#   YYYYMMDDHH <count>   one line per hour bucket, in the TZ set below
#
# Lines that cannot be used are skipped.  Diagnostics go to STDERR.
use strict;
use warnings;
use Encode;
use Time::Local;
use POSIX qw(strftime);

# Hour buckets are reported in Japan Standard Time (UTC+9).
$ENV{'TZ'} = "JST-9";

# Month abbreviation => zero-based month number, as timegm() expects.
my @month_names = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec');
my %month_index = map { $month_names[$_] => $_ } 0 .. $#month_names;

# Single-character JSON escapes and the characters they stand for.
my %json_escape = (
    '"'  => '"',
    '\\' => '\\',
    '/'  => '/',
    'b'  => "\b",
    'f'  => "\f",
    'n'  => "\n",
    'r'  => "\r",
    't'  => "\t",
);

# Decode the escapes of a JSON string body and return the character string.
# A high/low surrogate pair (\uD8xx\uDCxx) is combined into the single code
# point it encodes; otherwise an astral character would be counted as two
# characters and six UTF-8 bytes instead of one character and four bytes.
# Escapes are consumed left to right in one pass, so an escaped backslash
# followed by "u0041" is not mistaken for a \u escape.
sub _unescape_json_string {
    my ($raw) = @_;

    $raw =~ s{
          \\u ( [Dd][89ABab][0-9A-Fa-f]{2} ) \\u ( [Dd][C-Fc-f][0-9A-Fa-f]{2} )
        | \\u ( [0-9A-Fa-f]{4} )
        | \\  ( ["\\/bfnrt] )
    }{
          defined($1) ? chr(0x10000 + ((hex($1) - 0xD800) << 10) + (hex($2) - 0xDC00))
        : defined($3) ? chr(hex($3))
        :               $json_escape{$4}
    }gex;

    return $raw;
}

# Convert a "created_at" value such as "Wed Aug 27 13:08:45 +0000 2008" into
# epoch seconds.  The numeric zone offset is matched but not applied: the
# fields are handed to timegm() as they are.
# Returns nothing when the stamp is missing, malformed, names an unknown
# month, or holds an out-of-range field (timegm() croaks on those, which
# would otherwise abort the whole run on one bad line).
sub _parse_created_at {
    my ($stamp) = @_;
    return unless defined $stamp;

    my ($mon, $mday, $hour, $min, $sec, $year)
        = $stamp =~ /([A-Z][a-z]{2}) (\d+) (\d+):(\d+):(\d+) .\d+ (\d+)$/
        or return;
    return unless exists $month_index{$mon};

    my $epoch = eval {
        timegm($sec, $min, $hour, $mday, $month_index{$mon}, $year);
    };
    if (!defined $epoch) {
        warn "skipping tweet with out-of-range created_at \"$stamp\"\n";
        return;
    }
    return $epoch;
}

# Pull the top-level tweet text and timestamp out of one raw JSON line.
# Returns (decoded text, epoch seconds), or an empty list when the line has
# no usable "text" or "created_at".
sub _parse_line {
    my ($line) = @_;
    my $flat = $line;

    # Drop the outermost braces, then repeatedly collapse the innermost
    # ':{...}' objects so that the "text"/"created_at" matched below do not
    # come from a nested object such as "user".  This is a regex heuristic,
    # not a JSON parser: braces inside string values are not distinguished.
    $flat =~ s/^\{(.*)\}/$1/;
    while ($flat =~ s/:\{[^\}]+\}/:<>/g) {}

    # The string body may contain escaped quotes, so stop only at an
    # unescaped '"'.
    my ($raw_text) = $flat =~ /"text":"([^"\\]*(?:\\.[^"\\]*)*)"/;
    return unless defined $raw_text && length $raw_text;

    my ($stamp) = $flat =~ /"created_at":"(.+?)"/;
    my $epoch = _parse_created_at($stamp);
    return unless defined $epoch;

    return (_unescape_json_string($raw_text), $epoch);
}

# Read every input line, accumulate the statistics and print the report.
# When no line qualifies, a notice is written to STDERR and nothing is
# printed, instead of dying on a division by zero.
sub main {
    my $count    = 0;
    my $sum_byte = 0;
    my $sum_char = 0;
    my $sum_json = 0;
    my %hourly;

    while (defined(my $line = <>)) {
        my ($text, $epoch) = _parse_line($line);
        next unless defined $epoch;

        $sum_byte += length(encode('utf8', $text));
        $sum_char += length($text);
        $sum_json += length($line);
        $hourly{ strftime("%Y%m%d%H", localtime($epoch)) }++;
        $count++;
    }

    if (!$count) {
        warn "no tweets with \"text\" and \"created_at\" found; nothing to report\n";
        return;
    }

    print "ave bytes : ", $sum_byte / $count, "\n";
    print "ave chars : ", $sum_char / $count, "\n";
    print "ave jsons : ", $sum_json / $count, "\n";

    # Keys are fixed-width digit strings, so a plain string sort is
    # chronological.
    foreach my $hour (sort keys %hourly) {
        print "$hour $hourly{$hour}\n";
    }
    return;
}

# Run only when executed as a script, so the file can also be loaded
# without consuming input.
main() unless caller;
# Step 1: fetch the sample stream with basic-auth credentials ID:PW
# (presumably placeholders for a real account) and store it gzip-compressed
# in tw.gz.
curl --user ID:PW http://stream.twitter.com/1/statuses/sample.json \
    | gzip -c > tw.gz

# Step 2: decompress the capture, keep only the lines containing the literal
# string "lang":"ja", and feed them to twistat.pl; its STDERR goes to
# error.log.
gzip -dc tw.gz \
    | grep -F '"lang":"ja"' \
    | ./twistat.pl 2> error.log