#!/bin/sh
#
# geoupdate - update geocrawler archive index for this month's archive
#	- indexes the month from scratch and merges it into the master database
#
# Should be run from cron before noon GMT (before 4 AM PST) for proper
# date handling (see below)
#

# Installation layout: every path used below is derived from the group
# directory, so relocating the install only requires changing htdiggrp.
htdiggrp="/home/groups/h/ht/htdig"
htdigdir="${htdiggrp}/htdig"
scriptdir="${htdiggrp}/scripts"
bindir="${htdigdir}/bin"
dbdir="${htdigdir}/db"

# Archive URLs are keyed by YEAR/MONTH with no leading zero on the month
# (e.g. 2001/3).  Evaluating the date in a zone 12 hours behind GMT means a
# run before noon GMT still sees yesterday's date, so the run on the first
# of a month re-indexes the whole of the previous month's archive.
dates=$(TZ=GMT+12 date '+%Y/%m' | sed 's|/0|/|')

# Files created from here on are group-writable (umask 002).
umask 2

# Scratch database directory for this run's index.  Stop if it cannot be
# created: mkdir has already reported the reason on stderr.
mkdir -p "$dbdir/geoupdate" || exit 1

# Regenerate the per-run configuration.  It includes the main htdig.conf and
# overrides the database location and the URL set so that only the archive
# pages for $dates are crawled.  If the file cannot be written we must not
# continue, otherwise a stale geoupdate.conf from an earlier run (pointing at
# a different month) would be used instead.
cat > "$htdigdir/geoupdate.conf" <<! || exit 1
# rewritten by $0 when it runs...
include:	$htdigdir/htdig.conf
database_dir:	$dbdir/geoupdate
allow_numbers:	true
use_doc_date:	true
external_parsers:	text/html->text/html-internal $scriptdir/ungeoify.sh
limit_urls_to:	http://www.geocrawler.com/archives/3/8822/$dates/ http://www.geocrawler.com/archives/3/8825/$dates/
start_url:	http://www.geocrawler.com/archives/3/8822/$dates/0/ http://www.geocrawler.com/archives/3/8825/$dates/0/
!

# Crawl, merge and promote.  Every step is checked: if any of them fails the
# script stops immediately, so a failed or partial crawl is never merged into
# the master database and stale *.work files are never promoted over it.

# 1. Index the month from scratch (-i) into the scratch database.
"$bindir/htdig" -i -c "$htdigdir/geoupdate.conf" || {
	echo "$0: htdig failed; master database left untouched" >&2
	exit 1
}

# 2. Build the scratch database's own word/document indexes.
"$bindir/htmerge" -c "$htdigdir/geoupdate.conf" || {
	echo "$0: htmerge of update database failed; master database left untouched" >&2
	exit 1
}

# 3. Merge the scratch database (-m) into the master one, using alternate
#    work files (-a) so the live files are not modified during the merge.
"$bindir/htmerge" -a -c "$htdigdir/htdig.conf" -m "$htdigdir/geoupdate.conf" || {
	echo "$0: htmerge into master failed; work files not promoted" >&2
	exit 1
}

# 4. Promote the work files over the live ones.  Done in a subshell so the
#    cd does not leak; db.docdb is copied (not moved), leaving its .work
#    file in place, exactly as before.
(
	cd "$dbdir" &&
	mv db.words.db.work db.words.db &&
	mv db.docs.index.work db.docs.index &&
	cp -p db.docdb.work db.docdb
) || {
	echo "$0: failed to promote work files in $dbdir" >&2
	exit 1
}
