#!/usr/bin/php
<?php
// Copyright (c) 2008 Appalachian State University
// Licensed under the MIT License (see below)
//
// Problem: We have somewhere around a bazillion websites, each of which has its
//          own webalizer configuration and history files. For the Annual
//          Report, the boss wants a report of our top sites by page views and
//          by visits. There are some awesome contributed scripts, but I
//          couldn't find one that exactly matches our problem (at least not one
//          that works with stuff other than some HP web appliance).
//
// Solution: ./summalizer.sh
//
// For help, try ./summalizer.sh --help
//
// This script relies on the 'webalizer.hist' files that your stats directories
// contain, and assumes you have a bunch of separate config files that provide
// these. If you only have one config file and one webalizer.hist, this script
// is overkill.
//
// NOTE: This is a SCRIPT intended to be run from the COMMAND LINE. I don't
// know what will happen if you put it in a web-readable directory. More than
// likely it just wouldn't work, but I'm not responsible if it messes something
// up.
//
// If you make any improvements, I'd love to hear about them.
//
//
// Here's a shameless plug: This is a small script, but our pride and joy is:
// phpWebSite Content Management System (http://phpwebsite.appstate.edu)
//
//
// The MIT License
//
// Copyright (c) 2008 Appalachian State University
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

/*********
 * SCRIPT
 * You can edit below here, but it might not work afterwards...
 *********/

// Keywords looked up in each Webalizer configuration file
define('WEBALIZER_OUTPUTDIR', 'OutputDir');
define('WEBALIZER_HOSTNAME', 'HostName');

// Appended to a config's OutputDir to locate its history file
define('WEBALIZER_HISTFILE', '/webalizer.hist');

// Column positions inside one history record, and the expected column count
define('O_YEAR', 0);
define('O_MONTH', 1);
define('O_HITS', 2);
define('O_FILES', 3);
define('O_SITES', 4);
define('O_KBYTES', 5);
define('O_PAGES', 8);
define('O_VISITS', 9);
define('PARSE_COUNT', 10);

// Grand Totals (accumulated by parse_webalizer_history, printed by display_*)
$t_hits = $t_files = $t_sites = $t_kbytes = $t_pages = $t_visits = 0;

/**
 * Parse Webalizer Configuration File
 *
 * Scans the file line by line for the OutputDir and HostName keywords. Each
 * line is split into "keyword" and "value" on the first run of whitespace, so
 * both space- and tab-separated settings are recognised. Scanning stops as
 * soon as both values are known.
 *
 * @param $path string Path to the config file
 * @return array|false 'host' => hostname, 'dir' => outputdir; FALSE when the
 *                     file cannot be opened or lacks either keyword
 */
function parse_webalizer_config($path)
{
    $dir = NULL;
    $host = NULL;

    $fp = fopen($path, 'r');
    if ($fp === FALSE) {
        return FALSE;
    }

    $found = FALSE;
    while (($line = fgets($fp)) !== FALSE) {
        $parts = preg_split('/\s+/', trim($line), 2);
        if (count($parts) != 2) {
            continue;
        }

        if ($parts[0] === WEBALIZER_OUTPUTDIR) {
            $dir = trim($parts[1]);
        } elseif ($parts[0] === WEBALIZER_HOSTNAME) {
            $host = trim($parts[1]);
        }

        if (!is_null($dir) && !is_null($host)) {
            $found = TRUE;
            break;
        }
    }

    // Always release the handle, whether or not both keywords were found
    fclose($fp);

    if (!$found) {
        return FALSE;
    }

    return array('host' => $host, 'dir' => $dir);
}

/**
 * Parse Webalizer History File
 *
 * Sums the hits/files/sites/kbytes/pages/visits columns of every record whose
 * month lies inside the inclusive range [$minyear-$minmonth, $maxyear-$maxmonth]
 * and adds the same sums to the global grand totals.
 *
 * Blank lines and lines starting with '#' are skipped. Any other line that does
 * not consist of exactly PARSE_COUNT numeric fields aborts the parse.
 *
 * NOTE(review): the column constants assume "year month ..." records, but some
 * Webalizer releases appear to write "month year ...". Because a month can
 * never exceed 12, the two orders can be told apart per record, so both are
 * accepted here -- confirm against the Webalizer version in use.
 *
 * @param $path     string Path to the History file
 * @param $minmonth int    First month of the range (1-12)
 * @param $minyear  int    Year of the first month
 * @param $maxmonth int    Last month of the range (1-12)
 * @param $maxyear  int    Year of the last month
 * @return array|false Totals from the history file, FALSE if the file cannot
 *                     be opened or parsed
 */
function parse_webalizer_history($path, $minmonth, $minyear, $maxmonth, $maxyear)
{
    global $t_hits, $t_files, $t_sites, $t_kbytes, $t_pages, $t_visits;

    $fp = fopen($path, 'r');
    if ($fp === FALSE) {
        return FALSE;
    }

    $columns = array(
        'hits'   => O_HITS,
        'files'  => O_FILES,
        'sites'  => O_SITES,
        'kbytes' => O_KBYTES,
        'pages'  => O_PAGES,
        'visits' => O_VISITS,
    );
    $totals = array('hits' => 0, 'files' => 0, 'sites' => 0,
                    'kbytes' => 0, 'pages' => 0, 'visits' => 0);

    // Months are compared as a single ordinal so that the range test is
    // correct across year boundaries.
    $first = (int)$minyear * 12 + (int)$minmonth;
    $last  = (int)$maxyear * 12 + (int)$maxmonth;

    $ok = TRUE;
    while (($line = fgets($fp)) !== FALSE) {
        $line = trim($line);
        if ($line === '' || $line[0] === '#') {
            continue;
        }

        $fields = preg_split('/\s+/', $line);
        if (count($fields) != PARSE_COUNT
            || count(array_filter($fields, 'is_numeric')) != PARSE_COUNT) {
            fwrite(STDERR, "Error parsing history file $path, skipping...\n");
            $ok = FALSE;
            break;
        }

        $year  = (int)$fields[O_YEAR];
        $month = (int)$fields[O_MONTH];
        if ($month > 12 && $year >= 1 && $year <= 12) {
            // Record is stored month-first; swap the two columns
            $swap  = $year;
            $year  = $month;
            $month = $swap;
        }

        $when = $year * 12 + $month;
        if ($when < $first || $when > $last) {
            continue;
        }

        foreach ($columns as $key => $index) {
            // "+ 0" turns the numeric string into an int (or float if large)
            $totals[$key] += $fields[$index] + 0;
        }
    }

    fclose($fp);

    if (!$ok) {
        return FALSE;
    }

    $t_hits   += $totals['hits'];
    $t_files  += $totals['files'];
    $t_sites  += $totals['sites'];
    $t_kbytes += $totals['kbytes'];
    $t_pages  += $totals['pages'];
    $t_visits += $totals['visits'];

    return $totals;
}

/**
 * Print one line (or one HTML table row) per host
 *
 * @param $hosts   array  Host records ('host', 'visits', 'pages'), already sorted
 * @param $limit   int    Maximum number of hosts to print, 0 for all
 * @param $exclude array  Host names (after stripping) to leave out of the list
 * @param $strip   string Literal text removed from every host name
 * @param $html    bool   TRUE for <tr> rows, FALSE for plain text
 */
function show_hosts($hosts, $limit = 0, $exclude = null, $strip = '', $html = FALSE)
{
    if ($limit == 0) {
        $limit = count($hosts);
    }

    $count = 0;
    foreach ($hosts as $host) {
        $h = $host['host'];
        if ($strip !== '' && !is_null($strip)) {
            // Literal removal: "-s .com" must not behave like a regex
            $h = str_replace($strip, '', $h);
        }

        // Exclusions are matched against the stripped name
        if (!empty($exclude) && in_array($h, $exclude, TRUE)) {
            continue;
        }

        $pages = $host['pages'];
        $visits = $host['visits'];

        if ($html) {
            echo '<tr><td>' . htmlspecialchars($h, ENT_QUOTES, 'UTF-8')
                . "</td><td>$visits</td><td>$pages</td></tr>\n";
        } else {
            echo "$h $visits $pages\n";
        }

        if (++$count == $limit) {
            break;
        }
    }

    if (!$html) {
        echo "\n";
    }
}

/**
 * Print the report as plain text: top hosts by visits, top hosts by pages,
 * then the grand totals.
 */
function display_console($hosts, $limit = 0, $exclude = null, $strip = '')
{
    global $t_hits, $t_files, $t_sites, $t_kbytes, $t_pages, $t_visits;

    usort($hosts, 'compare_hosts_visits');
    echo "Top $limit Hosts By Visits: (host visits pages)\n";
    show_hosts($hosts, $limit, $exclude, $strip);

    usort($hosts, 'compare_hosts_pages');
    echo "Top $limit Hosts By Pages: (host visits pages)\n";
    show_hosts($hosts, $limit, $exclude, $strip);

    echo "TOTALS:\n";
    echo "Hits:\t$t_hits\n";
    echo "Files:\t$t_files\n";
    echo "Sites:\t$t_sites\n";
    echo "KBytes:\t$t_kbytes\n";
    echo "Pages:\t$t_pages\n";
    echo "Visits:\t$t_visits\n";
    echo "\n";
}

/**
 * Print the report as three HTML tables: top hosts by visits, top hosts by
 * pages, and the grand totals.
 */
function display_html($hosts, $limit = 0, $exclude = null, $strip = '')
{
    global $t_hits, $t_files, $t_sites, $t_kbytes, $t_pages, $t_visits;

    echo "<table>\n";
    echo "<tr><th colspan=\"3\">Top $limit Hosts by Visits</th></tr>\n";
    echo "<tr><th>Host</th><th>Visits</th><th>Pages</th></tr>\n";
    usort($hosts, 'compare_hosts_visits');
    show_hosts($hosts, $limit, $exclude, $strip, TRUE);
    echo "</table>\n";

    echo "<table>\n";
    echo "<tr><th colspan=\"3\">Top $limit Hosts by Pages</th></tr>\n";
    echo "<tr><th>Host</th><th>Visits</th><th>Pages</th></tr>\n";
    usort($hosts, 'compare_hosts_pages');
    show_hosts($hosts, $limit, $exclude, $strip, TRUE);
    echo "</table>\n";

    echo "<table>\n";
    echo "<tr><th colspan=\"2\">Totals</th></tr>\n";
    echo "<tr><th>Hits</th><td>$t_hits</td></tr>\n";
    echo "<tr><th>Files</th><td>$t_files</td></tr>\n";
    echo "<tr><th>Sites</th><td>$t_sites</td></tr>\n";
    echo "<tr><th>KBytes</th><td>$t_kbytes</td></tr>\n";
    echo "<tr><th>Pages</th><td>$t_pages</td></tr>\n";
    echo "<tr><th>Visits</th><td>$t_visits</td></tr>\n";
    echo "</table>\n";
}

/**
 * Print usage info and exit
 */
function usage()
{
    fwrite(STDERR, "Usage: summalizer.php [--html] [-n NUM] [-s STRING] [-e HOST]...\n");
    fwrite(STDERR, "                      [--from YYYY-MM] [--to YYYY-MM] FILE [FILE...]\n");
    fwrite(STDERR, "FILE is a Webalizer configuration file. One must be provided,\n");
    fwrite(STDERR, "more can be provided. This script is really only useful if you\n");
    fwrite(STDERR, "have more than one, though.\n");
    fwrite(STDERR, "\n");
    fwrite(STDERR, " -n NUM Display top NUM results, 0 for ALL results (default 0)\n");
    fwrite(STDERR, " -e HOST Exclude a given hostname from the results\n");
    fwrite(STDERR, " Note: Won't be listed, but WILL be in totals\n");
    fwrite(STDERR, " -s STRING Portion of hostname to be stripped from the report\n");
    fwrite(STDERR, " --from YYYY-MM First month to include (default 2007-06)\n");
    fwrite(STDERR, " --to YYYY-MM Last month to include (default 2008-05)\n");
    fwrite(STDERR, " --html Generate an HTML table\n");
    fwrite(STDERR, "\n");
    fwrite(STDERR, "EXAMPLE - Full statistics from all configs in HTML, and strip '.com':\n");
    fwrite(STDERR, " summalizer.php --html -s .com /etc/webalizer/configs/*\n");
    fwrite(STDERR, "\n");
    fwrite(STDERR, "EXAMPLE - Top 20 sites, excluding 'example.com' and 'myhost':\n");
    fwrite(STDERR, " summalizer.php -n 20 -e example.com -e myhost *.conf\n");
    exit(1);
}

/**
 * Parse a "YYYY-MM" command line value
 *
 * @param $value string Text given on the command line
 * @return array|false array(year, month) as integers, FALSE if malformed
 */
function parse_year_month($value)
{
    if (!preg_match('/^(\d{4})-(\d{1,2})$/', $value, $m)) {
        return FALSE;
    }

    $month = (int)$m[2];
    if ($month < 1 || $month > 12) {
        return FALSE;
    }

    return array((int)$m[1], $month);
}

/**
 * Main Routine
 *
 * Reads the command line, parses every configuration file named on it together
 * with the history file in its OutputDir, and prints the report.
 *
 * @return int Return value of the script
 */
function main()
{
    global $argv;

    // Default reporting window, overridable with --from / --to
    $minmonth = 6;
    $minyear = 2007;
    $maxmonth = 5;
    $maxyear = 2008;

    $hosts = array();
    $configs = array();
    $limit = 0;
    $html = FALSE;
    $exclude = array();
    $strip = '';

    $args = is_array($argv) ? array_slice($argv, 1) : array();
    $total = count($args);
    $takes_value = array('-n', '-e', '-s', '--from', '--to');

    // Pass 1: read every option first, so that options given after a FILE
    // still apply to that FILE.
    for ($i = 0; $i < $total; $i++) {
        $arg = $args[$i];

        if ($arg === '--help') {
            usage(); // --help anywhere means give help and exit
        }

        if ($arg === '--html') {
            $html = TRUE;
            continue;
        }

        if (!in_array($arg, $takes_value, TRUE)) {
            $configs[] = $arg;
            continue;
        }

        if ($i + 1 >= $total) {
            fwrite(STDERR, "Option $arg requires a value\n");
            usage();
        }
        $value = $args[++$i];

        if ($arg === '-n') {
            if (!preg_match('/^\d+$/', $value)) {
                fwrite(STDERR, "Integer must be specified with -n\n");
                usage();
            }
            $limit = (int)$value;
        } elseif ($arg === '-e') {
            $exclude[] = $value;
        } elseif ($arg === '-s') {
            $strip = $value;
        } else {
            $when = parse_year_month($value);
            if ($when === FALSE) {
                fwrite(STDERR, "$arg expects a month in YYYY-MM form\n");
                usage();
            }
            if ($arg === '--from') {
                $minyear = $when[0];
                $minmonth = $when[1];
            } else {
                $maxyear = $when[0];
                $maxmonth = $when[1];
            }
        }
    }

    if ($minyear * 12 + $minmonth > $maxyear * 12 + $maxmonth) {
        fwrite(STDERR, "--from must not be later than --to\n");
        usage();
    }

    // Pass 2: gather up the configurations
    foreach ($configs as $config) {
        if (!file_exists($config)) {
            fwrite(STDERR, "$config does not exist, skipping...\n");
            continue;
        }

        $result = parse_webalizer_config($config);
        if ($result === FALSE) {
            fwrite(STDERR, "Could not parse $config, skipping...\n");
            continue;
        }

        $histfile = $result['dir'] . WEBALIZER_HISTFILE;
        $totals = parse_webalizer_history($histfile, $minmonth, $minyear, $maxmonth, $maxyear);
        if ($totals === FALSE) {
            fwrite(STDERR, "Could not parse " . $histfile . ", skipping...\n");
            continue;
        }

        $totals['host'] = $result['host'];
        $hosts[] = $totals;
    }

    if (empty($hosts)) {
        fwrite(STDERR, "Please provide some webalizer configurations.\n");
        usage();
    }

    echo "\n\n";

    if ($html) {
        display_html($hosts, $limit, $exclude, $strip);
    } else {
        display_console($hosts, $limit, $exclude, $strip);
    }

    return 0;
}

/**
 * usort() callback: most visits first, host name as tie-breaker
 */
function compare_hosts_visits($a, $b)
{
    // On the off chance they're equal, go with the alphabet
    if ($a['visits'] == $b['visits']) {
        return strcmp($a['host'], $b['host']);
    }

    // This looks weird because we're sorting in DESCENDING order
    return ($a['visits'] < $b['visits']) ? 1 : -1;
}

/**
 * usort() callback: most pages first, host name as tie-breaker
 */
function compare_hosts_pages($a, $b)
{
    // On the off chance they're equal, go with the alphabet
    if ($a['pages'] == $b['pages']) {
        return strcmp($a['host'], $b['host']);
    }

    // This looks weird because we're sorting in DESCENDING order
    return ($a['pages'] < $b['pages']) ? 1 : -1;
}

// Run the report unless this file was positively identified as having been
// include()d by a different entry script. If the entry script cannot be
// resolved at all, err on the side of running.
$summalizer_entry = isset($_SERVER['SCRIPT_FILENAME'])
    ? realpath($_SERVER['SCRIPT_FILENAME'])
    : FALSE;
if ($summalizer_entry === FALSE || $summalizer_entry === __FILE__) {
    exit(main());
}