#!/usr/bin/php
<?php
// Copyright (c) 2008 Appalachian State University
// Licensed under the MIT License (see below)
//
// Problem: We have somewhere around a bazillion websites, each of which has its
// own webalizer configuration and history files. For the Annual
// Report, the boss wants a report of our top sites by page views and
// by visits. There are some awesome contributed scripts, but I
// couldn't find one that exactly matches our problem (at least not one
// that works with stuff other than some HP web appliance).
//
// Solution: ./summalizer.php
//
// For help, try ./summalizer.php --help
//
// This script relies on the 'webalizer.hist' files that your stats directories
// contain, and assumes you have a bunch of separate config files that provide
// these. If you only have one config file and one webalizer.hist, this script
// is overkill.
//
// NOTE: This is a SCRIPT intended to be run from the COMMAND LINE. I don't
// know what will happen if you put it in a web-readable directory. More than
// likely it just wouldn't work, but I'm not responsible if it messes something
// up.
//
// If you make any improvements, I'd love to hear about them.
//
//
// Here's a shameless plug: This is a small script, but our pride and joy is:
// phpWebSite Content Management System (http://phpwebsite.appstate.edu)
//
//
// The MIT License
//
// Copyright (c) 2008 Appalachian State University
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
/*********
* SCRIPT * You can edit below here, but it might not work afterwards...
*********/
// Keywords looked up in each Webalizer configuration file
define('WEBALIZER_OUTPUTDIR', 'OutputDir');
define('WEBALIZER_HOSTNAME', 'HostName');

// File name appended to a configuration's OutputDir to find its history
define('WEBALIZER_HISTFILE', '/webalizer.hist');

// Column offsets within one line of webalizer.hist. Webalizer writes the
// month number first and the year second; columns 6 and 7 are not used by
// this script.
define('O_MONTH', 0);
define('O_YEAR', 1);
define('O_HITS', 2);
define('O_FILES', 3);
define('O_SITES', 4);
define('O_KBYTES', 5);
define('O_PAGES', 8);
define('O_VISITS', 9);

// Number of columns every history line must have
define('PARSE_COUNT', 10);

// Grand Totals, accumulated across every history file that is parsed
$t_hits = $t_files = $t_sites = $t_kbytes = $t_pages = $t_visits = 0;

// Hand main()'s return value to the shell as the exit status
exit(main());
/**
 * Parse Webalizer Configuration File
 *
 * Scans the file line by line for the OutputDir and HostName directives.
 * A directive is a keyword followed by whitespace (spaces or tabs) and a
 * value; leading and trailing whitespace is ignored, as are blank lines and
 * lines starting with '#'. Scanning stops as soon as both have been seen.
 *
 * @param $path string Path to the config file
 * @return array|false 'host' => hostname, 'dir' => outputdir; FALSE if the
 *                     file cannot be opened or either directive is missing
 */
function parse_webalizer_config($path)
{
    $fp = fopen($path, 'r');
    if ($fp === FALSE) {
        return FALSE;
    }

    $dir  = NULL;
    $host = NULL;

    while (($line = fgets($fp)) !== FALSE) {
        $line = trim($line);
        if ($line === '' || $line[0] === '#') {
            continue;
        }

        // Split "Keyword <whitespace> value"; the value keeps inner spaces
        $parts = preg_split('/\s+/', $line, 2);
        if (count($parts) != 2) {
            continue;
        }

        if ($parts[0] == WEBALIZER_OUTPUTDIR) {
            $dir = $parts[1];
        } elseif ($parts[0] == WEBALIZER_HOSTNAME) {
            $host = $parts[1];
        }

        if (!is_null($dir) && !is_null($host)) {
            break;
        }
    }

    // Always release the handle, whichever way the loop ended
    fclose($fp);

    if (is_null($dir) || is_null($host)) {
        return FALSE;
    }

    return array('host' => $host, 'dir' => $dir);
}
/**
 * Parse Webalizer History File
 *
 * Sums the counters of every history line whose month lies inside the
 * inclusive window $minmonth/$minyear .. $maxmonth/$maxyear, adds those sums
 * to the global grand totals ($t_hits, $t_files, ...) and returns them.
 * Blank lines and lines starting with '#' are skipped.
 *
 * @param $path string Path to the History file
 * @param $minmonth int First month of the window (1-12)
 * @param $minyear int Year of the first month
 * @param $maxmonth int Last month of the window (1-12)
 * @param $maxyear int Year of the last month
 * @return array|false Totals from the history file ('hits', 'files',
 *                     'sites', 'kbytes', 'pages', 'visits'); FALSE if the
 *                     file cannot be opened or a line cannot be parsed
 */
function parse_webalizer_history($path, $minmonth, $minyear, $maxmonth, $maxyear)
{
    global $t_hits, $t_files, $t_sites, $t_kbytes, $t_pages, $t_visits;

    $fp = fopen($path, 'r');
    if ($fp === FALSE) {
        return FALSE;
    }

    $columns = array(
        'hits'   => O_HITS,
        'files'  => O_FILES,
        'sites'  => O_SITES,
        'kbytes' => O_KBYTES,
        'pages'  => O_PAGES,
        'visits' => O_VISITS,
    );
    $sums = array(
        'hits'   => 0,
        'files'  => 0,
        'sites'  => 0,
        'kbytes' => 0,
        'pages'  => 0,
        'visits' => 0,
    );

    // Collapse (year, month) into one month counter so the window test is a
    // plain range check instead of two independent comparisons.
    $first = $minyear * 12 + $minmonth;
    $last  = $maxyear * 12 + $maxmonth;

    while (($line = fgets($fp)) !== FALSE) {
        $line = trim($line);
        if ($line === '' || $line[0] === '#') {
            continue;
        }

        $fields = preg_split('/\s+/', $line);
        $valid  = (count($fields) == PARSE_COUNT);
        foreach ($fields as $field) {
            if (!is_numeric($field)) {
                $valid = FALSE;
                break;
            }
        }
        if (!$valid) {
            fclose($fp);
            fwrite(STDERR, "Error parsing history file $path, skipping...\n");
            return FALSE;
        }

        $year  = (int) $fields[O_YEAR];
        $month = (int) $fields[O_MONTH];
        // NOTE(review): Webalizer is understood to write the month before
        // the year. If the two columns come out reversed (a "month" above
        // 12 next to a "year" of 1..12), trust the values over the offsets.
        if ($month > 12 && $year >= 1 && $year <= 12) {
            $swap  = $year;
            $year  = $month;
            $month = $swap;
        }

        $stamp = $year * 12 + $month;
        if ($stamp < $first || $stamp > $last) {
            continue;
        }

        foreach ($columns as $key => $offset) {
            // "+ 0" turns the numeric string into an int or float
            $sums[$key] += $fields[$offset] + 0;
        }
    }
    fclose($fp);

    // Only a completely parsed file contributes to the grand totals
    $t_hits   += $sums['hits'];
    $t_files  += $sums['files'];
    $t_sites  += $sums['sites'];
    $t_kbytes += $sums['kbytes'];
    $t_pages  += $sums['pages'];
    $t_visits += $sums['visits'];

    return $sums;
}
/**
 * Print one line (or one table row) per host, in the order given.
 *
 * @param $hosts array List of arrays with 'host', 'pages' and 'visits' keys
 * @param $limit int Maximum number of hosts to print; 0 prints all of them
 * @param $exclude array|null Host names to leave out; compared against the
 *                            name AFTER $strip has been removed from it
 * @param $strip string Literal substring removed from every host name
 * @param $html bool TRUE prints table rows, FALSE prints plain text followed
 *                   by one blank line
 */
function show_hosts($hosts, $limit = 0, $exclude = null, $strip = '', $html = FALSE)
{
    // 0 means "no limit": cap at the number of hosts actually supplied
    if ($limit == 0) {
        $limit = count($hosts);
    }

    $shown = 0;
    foreach ($hosts as $host) {
        $name = $host['host'];
        if ((string) $strip !== '') {
            // Literal removal: "-s .com" must not behave like a regex
            $name = str_replace($strip, '', $name);
        }

        // Excluded hosts do not count towards the limit
        if (!empty($exclude) && in_array($name, $exclude, true)) {
            continue;
        }

        $pages  = $host['pages'];
        $visits = $host['visits'];
        if ($html) {
            echo "\n| $name | $visits | $pages |\n\n";
        } else {
            echo "$name $visits $pages\n";
        }

        if (++$shown == $limit) {
            break;
        }
    }

    if (!$html) {
        echo "\n";
    }
}
/**
 * Print the plain-text report: hosts ranked by visits, hosts ranked by
 * pages, then the grand totals read from the global $t_* counters.
 *
 * @param $hosts array Per-host totals, passed on to show_hosts()
 * @param $limit int Passed on to show_hosts() and shown in the headings
 * @param $exclude array|null Passed on to show_hosts()
 * @param $strip string Passed on to show_hosts()
 */
function display_console($hosts, $limit = 0, $exclude = null, $strip = '')
{
    global $t_hits, $t_files, $t_sites, $t_kbytes, $t_pages, $t_visits;

    // Heading label => comparator used to order the hosts for that section
    $rankings = array(
        'Visits' => 'compare_hosts_visits',
        'Pages'  => 'compare_hosts_pages',
    );
    foreach ($rankings as $metric => $comparator) {
        usort($hosts, $comparator);
        echo "Top $limit Hosts By $metric: (host visits pages)\n";
        show_hosts($hosts, $limit, $exclude, $strip);
    }

    $grand = array(
        'Hits'   => $t_hits,
        'Files'  => $t_files,
        'Sites'  => $t_sites,
        'KBytes' => $t_kbytes,
        'Pages'  => $t_pages,
        'Visits' => $t_visits,
    );
    echo "TOTALS:\n";
    foreach ($grand as $label => $value) {
        echo "$label:\t$value\n";
    }
    echo "\n";
}
/**
 * Print the table-style report: hosts ranked by visits, hosts ranked by
 * pages, then the grand totals read from the global $t_* counters.
 *
 * @param $hosts array Per-host totals, passed on to show_hosts()
 * @param $limit int Passed on to show_hosts() and shown in the headings
 * @param $exclude array|null Passed on to show_hosts()
 * @param $strip string Passed on to show_hosts()
 */
function display_html($hosts, $limit = 0, $exclude = null, $strip = '')
{
    global $t_hits, $t_files, $t_sites, $t_kbytes, $t_pages, $t_visits;

    // Heading label => comparator used to order the hosts for that section
    $rankings = array(
        'Visits' => 'compare_hosts_visits',
        'Pages'  => 'compare_hosts_pages',
    );
    foreach ($rankings as $metric => $comparator) {
        echo "| Top $limit Hosts by $metric |\n\n";
        echo "| Host | Visits | Pages |\n\n";
        usort($hosts, $comparator);
        show_hosts($hosts, $limit, $exclude, $strip, TRUE);
        echo "\n\n";
    }

    $grand = array(
        'Hits'   => $t_hits,
        'Files'  => $t_files,
        'Sites'  => $t_sites,
        'KBytes' => $t_kbytes,
        'Pages'  => $t_pages,
        'Visits' => $t_visits,
    );
    echo "| Totals |\n\n";
    foreach ($grand as $label => $value) {
        echo "| $label | $value |\n\n";
    }
    echo "\n\n";
}
/**
 * Write the help text to standard error and terminate with exit status 1.
 */
function usage()
{
    $help = array(
        "Usage: summalizer.php [-n] [-s STRING] [-e HOST]... FILE [FILE...]",
        "FILE is a Webalizer configuration file. One must be provided,",
        "more can be provided. This script is really only useful if you",
        "have more than one, though.",
        "",
        " -n NUM Display top NUM results, 0 for ALL results (default 0)",
        " -e HOST Exclude a given hostname from the results",
        " Note: Won't be listed, but WILL be in totals",
        " -s STRING Portion of hostname to be stripped from the report",
        " --html Generate an HTML table",
        "",
        "EXAMPLE - Full statistics from all configs in HTML, and strip '.com':",
        " summalizer.php --html -s .com /etc/webalizer/configs/*",
        "",
        "EXAMPLE - Top 20 sites, excluding 'example.com' and 'myhost':",
        " summalizer.php -n 20 -e example.com -e myhost *.conf",
    );

    // Every line, including the last, is newline-terminated
    fwrite(STDERR, implode("\n", $help) . "\n");
    exit(1);
}
/**
 * Main Routine
 *
 * Walks the command-line arguments in order. Options (-n NUM, -e HOST,
 * -s STRING, --html, --help) are recorded; every other argument is treated
 * as a Webalizer configuration file, whose history file is parsed right
 * away. After the last argument the report is printed to standard output.
 * Bad or incomplete options end the script through usage().
 *
 * @return int Return value of the script
 */
function main()
{
    global $argv;

    // Reporting window (inclusive). NOTE: fixed in code as June 2007 through
    // May 2008; edit these four values to report on a different period.
    $minmonth = 6;
    $minyear  = 2007;
    $maxmonth = 5;
    $maxyear  = 2008;

    $hosts   = array();
    $limit   = 0;
    $html    = FALSE;
    $exclude = array();
    $strip   = '';

    // Option seen on the previous argument that still needs its value
    $pending = NULL;

    // Skip the script name without modifying the global $argv
    foreach (array_slice($argv, 1) as $arg) {
        if ($pending !== NULL) {
            if ($pending === '-n') {
                // Only a non-negative whole number makes sense as a limit
                if (!preg_match('/\A[0-9]+\z/', $arg)) {
                    fwrite(STDERR, "Integer must be specified with -n\n");
                    usage();
                }
                $limit = (int) $arg;
            } elseif ($pending === '-e') {
                $exclude[] = $arg;
            } else {
                $strip = $arg;
            }
            $pending = NULL;
            continue;
        }

        if ($arg === '--help') {
            usage(); // --help anywhere means give help and exit
        }
        if ($arg === '--html') {
            $html = TRUE;
            continue;
        }
        if ($arg === '-n' || $arg === '-e' || $arg === '-s') {
            $pending = $arg;
            continue;
        }

        // Anything else is a Webalizer configuration file
        if (!file_exists($arg)) {
            fwrite(STDERR, "$arg does not exist, skipping...\n");
            continue;
        }
        $result = parse_webalizer_config($arg);
        if ($result === FALSE) {
            fwrite(STDERR, "Could not parse $arg, skipping...\n");
            continue;
        }

        $histfile = $result['dir'] . WEBALIZER_HISTFILE;
        $totals = parse_webalizer_history($histfile,
            $minmonth, $minyear, $maxmonth, $maxyear);
        if ($totals === FALSE) {
            fwrite(STDERR, "Could not parse " . $histfile . ", skipping...\n");
            continue;
        }

        $totals['host'] = $result['host'];
        $hosts[] = $totals;
    }

    // An option given as the very last argument never received its value
    if ($pending !== NULL) {
        fwrite(STDERR, "Option $pending requires a value\n");
        usage();
    }

    if (empty($hosts)) {
        fwrite(STDERR, "Please provide some webalizer configurations.\n");
        usage();
    }

    echo "\n\n";
    if ($html) {
        display_html($hosts, $limit, $exclude, $strip);
    } else {
        display_console($hosts, $limit, $exclude, $strip);
    }

    return 0;
}
/**
 * usort() callback: most visits first, ties broken by host name (strcmp).
 *
 * @param $a array Host entry with 'visits' and 'host' keys
 * @param $b array Host entry with 'visits' and 'host' keys
 * @return int Negative, zero or positive, as usort() expects
 */
function compare_hosts_visits($a, $b)
{
    $left  = $a['visits'];
    $right = $b['visits'];

    // Same visit count: fall back to alphabetical order
    if ($left == $right) {
        return strcmp($a['host'], $b['host']);
    }

    // Descending order, so the smaller count sorts later
    if ($left < $right) {
        return 1;
    }
    return -1;
}
/**
 * usort() callback: most pages first, ties broken by host name (strcmp).
 *
 * @param $a array Host entry with 'pages' and 'host' keys
 * @param $b array Host entry with 'pages' and 'host' keys
 * @return int Negative, zero or positive, as usort() expects
 */
function compare_hosts_pages($a, $b)
{
    $left  = $a['pages'];
    $right = $b['pages'];

    // Different page counts decide the order outright (descending)
    if ($left != $right) {
        return ($left < $right) ? 1 : -1;
    }

    // Same page count: fall back to alphabetical order
    return strcmp($a['host'], $b['host']);
}
?>