// Internet Duct Tape

Perl Script – WordPress.com 7 Day Referrer Parser

More information about this script and a link to download it directly.

parse.pl


############################################################################
#
# WordPress.com 7 Day Referrer Log Parser by https://engtech.wordpress.com
#
# Thanks for your interest in this script, but I have to warn you that it
# isn't intended for general usage and isn't well supported. I'm offering
# it as a "Hey, this might be useful to you if you already know Perl/unix".
#
# How to run
#
# 1. Login to your wordpress.com blog and download the 7 Day Referrer page
#    to a file. This script might only work if the page is downloaded with
#    Firefox. I've also only tried having the downloaded page in the same
#    directory as the script.
#
# 2. Run the script giving the name of the downloaded page as the first
#    argument:
#             perl parse.pl week1.html
#
# 3. The first time the script is run with a new web page it will create
#    a CSV file. This file can be edited with a text editor or with
#    Microsoft Excel. When you first edit the file both columns will have
#    the same information. What you want to do is edit the second column
#    and create "groups":
#                        board.progaming.it,other articles
#                        chris.pirillo.com,chris.pirillo.com
#                        coolthingoftheday.blogspot.com,long tail
#
# 4. Re-run the script and it will create an HTML file (IE: week1_out.html)
#    with all of the URLs from the 7 Day Referrers page grouped as you 
#    defined in the CSV file.
#    It will also create a second CSV file called "week1_graph.csv" that is
#    a list of the groups that are used for the first generated graph.
#    Any group that is not listed in this CSV file will be put on the second
#    generated graph.
#
# 5. The process I use is:
#    - Run the script to generate the CSV file
#    - Look at the output and create groups where it makes sense
#    - Keep re-running the script and editing the CSV until I like the way
#      the HTML output is organized.
#    - Split the graphs the way I want to.
#
# 6. I'm sorry the graphs are so friggin' ugly. I need to read a tutorial
#    on "How to not make GD::Graph output shit."
#
############################################################################

use strict;
use warnings;
use Data::Dumper;
use GD::Graph::lines;
use FileHandle;

# ---------------------------------------------------------------------------
# State shared by the subroutines below.
# ---------------------------------------------------------------------------
my $has_csv = 0;          # set by readCSV() when the grouping CSV is readable
my $has_csv_graph = 0;    # set by readCSV() when the graph CSV is readable
my @data = ();            # one hashref per "statsDay" table: group => hits
my $data_idx = -1;        # index into @data of the day currently being parsed
my %sites = ();           # group => index of the last day it was seen in
my %urls = ();            # group => { url => 1 }
my %translate = ();       # raw site name => group name (from the CSV)
my %totalsFromSite = ();  # group => total hits, filled in by output()

my $file = $ARGV[0];

# Fail with a usage hint instead of an "uninitialized value" warning when
# the script is started without an argument.
if (not defined $file) {
   die "usage: perl $0 <saved 7 Day Referrers page>\n";
}
if (! -f $file) {
   die "could not find file '$file': $!";
}

# Derive the output prefix by stripping everything from the first dot of
# the file NAME. The character class keeps the match inside the last path
# component, so "./week1.html" gives "./week1" rather than an empty prefix.
my $prefix = $file;
$prefix =~ s/\.[^\/\\]*$//;
my $csv_file = $prefix . ".csv";
my $csv_graph_file = $prefix . "_graph.csv";
my @sites_high = ();      # groups drawn on the first graph (from graph CSV)
my $graph_high = $prefix."_high.png";
my $graph_low = $prefix."_low.png";

# Driver: each run advances one step further depending on which CSV files
# already exist next to the downloaded page.
readCSV();
input();
if (!$has_csv) { writeCSV(); }
output();
if ($has_csv && !$has_csv_graph) { writeCSVGraph(); }
if ($has_csv && $has_csv_graph) {
   generateGraph();
}
exit(0);

# Load the two optional CSV files that sit next to the downloaded page.
#
# $csv_file        one "site,group" pair per line; fills %translate and
#                  sets $has_csv.
# $csv_graph_file  comma separated group names for the first graph; fills
#                  @sites_high and sets $has_csv_graph.
#
# Takes no arguments. Dies if a file is readable but cannot be opened.
sub readCSV {
   if (-r $csv_file) {
      $has_csv = 1;
      open(my $ifh, '<', $csv_file)
         or die "could not read file '$csv_file': $!";
      while (my $line = <$ifh>) {
         # Strip LF and CRLF alike: the file may have been saved by Excel,
         # and a stray "\r" would otherwise become part of the group name.
         $line =~ s/\r?\n\z//;
         next if $line =~ m/^\s*$/;
         my ($site, $group) = split(/,/, $line);
         next if not defined $site or $site eq '';
         # $group is undef when the second column is empty; process()
         # only translates sites whose group is defined.
         $translate{$site} = $group;
      }
      close($ifh);
   }
   if (-r $csv_graph_file) {
      $has_csv_graph = 1;
      open(my $ifh, '<', $csv_graph_file)
         or die "could not read file '$csv_graph_file': $!";
      @sites_high = ();
      while (my $line = <$ifh>) {
         $line =~ s/\r?\n\z//;
         # A trailing blank line must not wipe out the groups read so far.
         next if $line =~ m/^\s*$/;
         # Accumulate, so groups spread over several lines are all kept.
         push(@sites_high, split(/,/, $line));
      }
      close($ifh);
   }
}

# Create the initial grouping CSV: one "site,site" line per key of %sites,
# in sorted order, then tell the user to edit it.
sub writeCSV {
   my $ofh = FileHandle->new($csv_file, "w")
      || die "could not write file '$csv_file': $!";
   for my $name (sort keys %sites) {
      $ofh->print("$name,$name\n");
   }
   $ofh->close;
   print "Please edit $csv_file to create groups and rerun.\n";
}

# Create the graph CSV: a single line holding every key of %sites, sorted
# and comma separated, then tell the user to edit it.
sub writeCSVGraph {
   # Work on copies so that chomp never touches the hash keys themselves.
   my @names = map { my $name = $_; chomp($name); $name } sort keys %sites;
   my $line = join(",", @names);

   my $ofh = FileHandle->new($csv_graph_file, "w")
      || die "could not write file '$csv_graph_file': $!";
   $ofh->print($line . "\n");
   $ofh->close;
   print "Please edit $csv_graph_file to break groups into high and low for the graphs.\n";
}


# Parse the downloaded referrer page ($file) line by line.
#
# Every '<table class="statsDay">' line starts a new day: an empty hash is
# appended to @data and $data_idx is advanced. Every stats row found on a
# line is handed to process() as (url, site, hits).
sub input {
   my $ifh = FileHandle->new($file, "r") || die "could not read file '$file': $!";
   while (my $line = <$ifh>) {
      # Start of a new day table.
      if ($line =~ m/<table class="statsDay">/) {
         push(@data, {});
         $data_idx++;
      }

      # Linked row, e.g.
      # <tr class="alternate"><td><a href="http://digg.com/view/all/popular/today/page3">digg.com/view/all/popular/today/page3</a></td><td class="views">7</td></tr>
      # The site is the link text up to its first "/".
      if ($line =~ m/<td><a href="(.*?)">(.*?)(\/.*|)<\/a><\/td><td class="views">(\d+)<\/td><\/tr>/) {
         my @row = ($1, $2, $4);
         process(@row);
      }
      # Plain row, e.g.
      # <tr><td>engtech.wordpress.com/tag/nokia-6682</td><td class="views">4</td></tr>
      # The url is the whole cell text, the site the part before the first "/".
      elsif ($line =~ m/<td>(.*?)(\/.*|)<\/td><td class="views">(\d+)<\/td><\/tr>/) {
         my @row = ("$1$2", $1, $3);
         process(@row);
      }
   }
   $ifh->close;
}

# Record one referrer row for the day currently being parsed.
#
#   $url   referring url
#   $site  site part of the url; replaced by its group when %translate
#          defines one
#   $hits  number of views to add to that group's count for the day
sub process {
   my ($url, $site, $hits) = @_;

   # Normalize the site name to its group.
   my $group = defined $translate{$site} ? $translate{$site} : $site;

   # Remember the day index the group was last seen in (newest to oldest).
   $sites{$group} = $data_idx;

   # Keep track of the URLs that belong to the group.
   $urls{$group} = {} unless defined $urls{$group};
   $urls{$group}{$url} = 1;

   # Keep count for the current day.
   my $day = $data[$data_idx];
   $day->{$group} = 0 unless defined $day->{$group};
   $day->{$group} += $hits;
}

# Write "<prefix>_out.html" with two tables:
#   1. one row per group with its hits per day and a totals column;
#   2. one row per group with its total and the list of referring URLs.
#
# The pseudo group 'REMOVE' is dropped from %sites and %urls first.
# Side effects: fills %totalsFromSite and stores an explicit 0 in every
# day hash of @data for groups that had no hits on that day.
# Dies if the output file cannot be opened or fully written.
sub output {
   delete($sites{'REMOVE'});
   delete($urls{'REMOVE'});

   # Sort once; the order is needed for both tables.
   my @ordered = sort {sortSites($a, $b)} keys %sites;
   # Days are emitted from the last parsed table back to the first.
   my @day_indices = reverse(0 .. $#data);

   my $ofile = $prefix."_out.html";
   my $ofh = FileHandle->new($ofile, "w") || die "could not write '$ofile': $!";
   print $ofh "<TABLE BORDER=\"1\" CELLPADDING=\"5\" CELLSPACING=\"5\" WIDTH=\"100%\">\n";
   print $ofh "<TR><TH>Site</TH>";
   foreach my $day (1 .. scalar(@day_indices)) {
      print $ofh "<TH>Day $day</TH>";
   }
   print $ofh "<TH>Totals</TH></TR>\n";

   foreach my $site (@ordered) {
      my $row = "<tr><td>$site</td>";
      my $total = 0;
      foreach my $i (@day_indices) {
         # Normalize: a group without hits on this day counts as 0.
         if (not defined $data[$i]->{$site}) {
            $data[$i]->{$site} = 0;
         }
         my $value = $data[$i]->{$site};
         $total += $value;
         $row .= "<TD>".$value."</TD>";
      }
      # Always defined, even when no day table was parsed.
      $totalsFromSite{$site} = $total;
      # Every cell above is already closed, so the totals cell is opened
      # directly; an extra "</TD>" here would make the row malformed.
      print $ofh $row . "<TD>" . $total . "</TD></TR>\n";
   }
   print $ofh "</TABLE>\n";

   print $ofh "<TABLE BORDER=\"1\" CELLPADDING=\"5\" CELLSPACING=\"5\" WIDTH=\"100%\">\n";

   foreach my $site (@ordered) {
      my $total = $totalsFromSite{$site};
      my $url_text = "<UL>";
      foreach my $url (sort keys %{$urls{$site}}) {
         $url_text .= "<LI><A HREF=\"$url\">$url</A></LI>";
      }
      $url_text .= "</UL>";
      print $ofh "<TR><TD>$site</TD><TD>$total</TD><TD>$url_text</TD></TR>";
   }

   print $ofh "</TABLE>\n";

   # Buffered write errors only surface at close time.
   close($ofh) || die "could not write '$ofile': $!";

}

# Comparison function for two group names: the higher %sites value sorts
# first; groups with equal values are ordered alphabetically.
# Returns -1, 0 or 1 like <=> and cmp.
sub sortSites {
   my ($left, $right) = @_;
   my $by_value = $sites{$right} <=> $sites{$left};
   return $by_value if $by_value != 0;
   return $left cmp $right;
}

# Draw the two traffic graphs.
#
# Groups listed in @sites_high (from the graph CSV) go into $graph_high;
# every other group left in %sites goes into $graph_low.
# Side effect: the groups of the first graph are removed from %sites.
# A graph without any group or without any day is skipped with a warning
# instead of being handed to image().
sub generateGraph {
   my ($rows_high, $max_high) = _graphRows(@sites_high);
   # Whatever is drawn on the first graph must not appear on the second.
   delete @sites{@sites_high};
   if (@sites_high && @{$rows_high}) {
      image($graph_high, $max_high, $rows_high, \@sites_high);
   }
   else {
      warn "nothing to plot for $graph_high, skipping\n";
   }

   my @legend = sort {sortSites($a, $b)} keys %sites;
   my ($rows_low, $max_low) = _graphRows(@legend);
   if (@legend && @{$rows_low}) {
      image($graph_low, $max_low, $rows_low, \@legend);
   }
   else {
      warn "nothing to plot for $graph_low, skipping\n";
   }
}

# Build the data points for one graph from the given group names.
# Returns (\@rows, $max): one row per day of @data in reverse order, each
# row being [day number, hits of group 1, hits of group 2, ...], and the
# largest hit count seen.
sub _graphRows {
   my @names = @_;
   my @rows = ();
   my $max = 0;
   my $day = 0;
   foreach my $ref (reverse @data) {
      my @row = ($day);
      foreach my $site (@names) {
         # A group named in the graph CSV may be missing from the data
         # (e.g. a typo); plot it as 0 rather than as undef.
         my $value = defined $ref->{$site} ? $ref->{$site} : 0;
         if ($value > $max) {
            $max = $value;
         }
         push(@row, $value);
      }
      push(@rows, \@row);
      $day++;
   }
   return (\@rows, $max);
}

# Render one line chart as a PNG file.
#
#   $file  name of the PNG file to write
#   $max   largest y value in the data, used as the top of the y axis
#   $gref  arrayref of rows, each [x value, y of series 1, y of series 2, ...]
#   $lref  arrayref of legend entries, one per series
#
# Dies if the chart cannot be plotted or the file cannot be written.
sub image {
   my ($file, $max, $gref, $lref) = @_;

   my $gdata = GD::Graph::Data->new();
   foreach my $row (@{$gref}) {
      $gdata->add_point(@{$row});
   }
   my @colours = ("black", "blue", "purple", "green", "red", "gray", "dgray");

   # All fonts come from the same directory. The x axis font was spelled
   # "lusimbi.ttf" before, unlike the "luximbi.ttf" used for the y axis
   # and the legend.
   my $font_dir = "/usr/X11R6/lib/X11/fonts/TTF";

   my $chart = GD::Graph::lines->new(600,375);
   $chart->set_legend(@{$lref});
   $chart->set_x_axis_font("$font_dir/luximbi.ttf", 10);
   $chart->set_y_axis_font("$font_dir/luximbi.ttf", 10);
   $chart->set_x_label_font("$font_dir/luximb.ttf", 12);
   $chart->set_y_label_font("$font_dir/luximb.ttf", 12);
   $chart->set_legend_font("$font_dir/luximbi.ttf", 10);

   my %options =
     (
      y_label             => "Traffic",
      x_label             => "Days",
      line_width          => 3,
      y_long_ticks        => 1,
      dclrs               => [@colours]
     );
   # Only pin the top of the y axis when there is a positive maximum;
   # a maximum of 0 would presumably leave the axis without a usable
   # range, so GD::Graph picks the scale itself in that case.
   if (defined $max && $max > 0) {
      $options{y_max_value} = $max;
   }
   $chart->set(%options);

   # plot() returns undef on failure; report GD::Graph's own error text.
   my $gd = $chart->plot($gdata)
     or die "Cannot plot $file: " . $chart->error . "\n";

   open(my $ofh, '>', $file) or
     die "Cannot open $file output png file for writing: $!";
   binmode($ofh);   # PNG is binary data
   print {$ofh} $gd->png;
   close($ofh) or
     die "Cannot finish writing $file: $!";
}

Generated using PerlTidy (-html -nss)

%d bloggers like this: