#!/usr/bin/perl
#
#	map_site.pl
#	from map_site-0.0.2-prototype
#	(http://www.tatanka.com/software/product/map_site/index.html)
#
#	generate a sitemap for a website
# 
#	Copyright 2012 Michael Marking
# 
#	Michael Marking <marking@tatanka.com>
#
#	Support: No support is promised with this software. However,
#	some support may be provided on a time-available basis. See the
#	SUPPORT file included with the distribution, or contact
#	support@tatanka.com.
#
#	This program is free software: you can redistribute it and/or
#	modify it under the terms of version 3 of the GNU General Public
#	License as published by the Free Software Foundation.
#
#	This program is distributed in the hope that it will be useful,
#	but WITHOUT ANY WARRANTY; without even the implied warranty of
#	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#	GNU General Public License version 3 for more details.
#
#	You should have received a copy of version 3 of the GNU General
#	Public License along with this program. If not, see
#	<http://www.gnu.org/licenses/>.
#

# Program identity.  The banner is printed unconditionally at startup,
# and the flag set afterwards records that fact so that the --version
# handler does not print it a second time.

( $program_name, $program_version ) = ( "map_site", "0.0.2" ) ;

print $program_name . " " . $program_version . "\n" ;
$name_and_version_printed = 1 ;

#
#	Default option values (all empty, i.e. false).
#

$verbose = $quiet = $force = "" ;

#
#	Process exit codes.
#

( $EXIT_SUCCESS, $EXIT_FAILURE ) = ( 0, 1 ) ;

# configuration; someday these might be set or altered from the command
# line, or from a configuration file, but for now...
#
# file name suffixes (without the leading dot) eligible for the sitemap

@suffixes_allowed = qw(
  html htm xml xhtml
  txt pdf ps
  tar gz zip z
  asc sig
  deb rpm
  pl c sh h cxx idl m4 mk
) ;

# determine the run date and time (UTC, broken down by gmtime)

(
  $run_sec,  $run_min,  $run_hour,
  $run_mday, $run_mon,  $run_year,
  $run_wday, $run_yday, $run_isdst
) = gmtime ( time ) ;

#
#	Loop through command line arguments.
#

$website_root_directory = "" ;
$website_url = "" ;

# Consume @ARGV one argument at a time.
#   --version          exit successfully (the banner has normally been
#                      printed already at startup)
#   --help             print the usage text and exit successfully
#   --exclude pattern  append pattern to @exclude_pattern_list
# The first argument that is not an option is taken as the website URL,
# the second as the web site root directory; a third is an error.
# Every error prints a message and exits with $EXIT_FAILURE.

while ( @ARGV )
	{
	$command_line_argument = shift @ARGV ;

	# numeric comparison: length() returns a number
	if ( length ( $command_line_argument ) == 0 )
		{
		print "$program_name: Error: zero-length "
			. "command line argument; try "
			. "$program_name --help.\n" ;
		exit $EXIT_FAILURE ;
		}

	if ( $command_line_argument eq "--version" )
		{
		if ( ! $name_and_version_printed )
			{
			print "$program_name $program_version\n" ;
			}
		exit $EXIT_SUCCESS ;
		}

	if ( $command_line_argument eq "--help" )
		{

print "Usage: map_site [--exclude pattern] website_url [website_rootdir]\n" ;
print "    website_rootdir directory defaults to current directory\n" ;
print "    --exclude argument may be repeated; excludes files\n" ;
print "        with names matching specified pattern\n" ;
print "    map_site --help\n" ;
print "Description: Creates a sitemap.xml file for a website.\n" ;
print "    Will include .htm, .html, .gz, .tar, .zip, .pdf, .txt,\n" ;
print "    and other files which are not graphics (.png, .jpg, etc).\n" ;
print "    Intended to be run on webserver, with .htdocs as site\n" ;
print "    root directory.\n" ;
print "    If installed properly, you can enter 'man map_site' for details.\n" ;
print "License: Copyright 2012 Michael Marking <marking\@tatanka.com>.\n" ;
print "    This is free software, released under GNU General Public\n" ;
print "    License version 3. This program comes with absolutely\n" ;
print "    NO WARRANTY, without even the implied warranties of\n" ;
print "    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n" ;

		exit $EXIT_SUCCESS ;
		}

	if ( $command_line_argument eq "--exclude" )
		{
		# the pattern is the next argument; there must be one
		if ( @ARGV == 0 )
			{
			print "$program_name: Error: missing --exclude "
				. "pattern; try "
				. "$program_name --help.\n" ;
			exit $EXIT_FAILURE ;
			}
		$exclude_pattern = shift @ARGV ;
		# something that looks like another option was most
		# likely not meant as a pattern
		if ( $exclude_pattern =~ m/^\-/ )
			{
			print "$program_name: Error: missing or bad "
				. "--exclude pattern ($exclude_pattern?); try "
				. "$program_name --help.\n" ;
			exit $EXIT_FAILURE ;
			}
		push @exclude_pattern_list, $exclude_pattern ;
		next ;
		}

	if ( substr ( $command_line_argument, 0, 1 ) eq "-" )
		{
		print "$program_name: Error: unrecognized option "
			. "$command_line_argument; try "
			. "$program_name --help.\n" ;
		exit $EXIT_FAILURE ;
		}

	# the command line argument doesn't begin with a "-", so it's
	# not an option; ought to be the designation of the web site
	# root directory or the website url

	if ( $website_url eq "" )
		{
		# no website url yet specified: this must be it
		# we'll do only basic syntax checking, but the scheme has
		# to be at the very start of the argument (the match is
		# anchored), and may be written in either case
		if ( $command_line_argument !~ m(^https?://)i )
			{
			print "$program_name: Error: $command_line_argument "
			  . "doesn't look like a proper URL\n" ;
			exit $EXIT_FAILURE ;
			}
		$website_url = $command_line_argument ;
		# make sure there's a slash at the end
		if ( substr ( $website_url, -1, 1 ) ne "/" )
			{
			$website_url = $website_url . "/" ;
			}
		next ;
		}

	# it's not the url, must be the root directory
	if ( not -e $command_line_argument )
		{
		print "$program_name: Error: supposed web site "
			. "root directory $command_line_argument does not "
			. "exist; try $program_name --help\n" ;
		exit $EXIT_FAILURE ;
		}

	if ( not -d $command_line_argument )
		{
		print "$program_name: Error: supposed web site "
			. "root directory $command_line_argument is not a "
			. "directory; try $program_name --help\n" ;
		exit $EXIT_FAILURE ;
		}

	# has the caller already specified a web site root directory?

	if ( $website_root_directory ne "" )
		{
		print "$program_name: Error: web site root directory "
			. "already specified as $website_root_directory; "
			. "cannot rename to $command_line_argument; "
			. "try $program_name --help\n" ;
		exit $EXIT_FAILURE ;
		}

	$website_root_directory = $command_line_argument ;

	# make sure we end directory path with trailing slash
	if ( substr ( $website_root_directory, -1, 1 ) ne "/" )
		{
		$website_root_directory = $website_root_directory . "/" ;
		}

	} # end while ( @ARGV )

# if a root directory has not been specified, then default to the
# current directory
if ( $website_root_directory eq "" )
	{
	$website_root_directory = "./" ;
	}

print "exclude patterns: @exclude_pattern_list\n" ;
print "root directory: $website_root_directory\n" ;

# backup existing sitemap.xml, if any
#
# The backup is a copy of the existing file named
# sitemap.xml.<modification time>, so an unchanged sitemap is backed up
# at most once.  Anything at that path which is not a regular file is
# treated as an error.

$sitemap_path = $website_root_directory . "sitemap.xml" ;

if ( -e $sitemap_path )
	{
	if ( not -f $sitemap_path )
		{
		print "$program_name: existing $sitemap_path is not a "
			. "regular file, cannot back it up and do not "
			. "know what to do.\n" ;
		exit $EXIT_FAILURE ;
		}
	# make a name for the backup file, based on the file's
	# modification time (element 9 of the list returned by stat)
	$existing_sitemap_mtime = ( stat ( $sitemap_path ) ) [ 9 ] ;
	defined ( $existing_sitemap_mtime )
		or die "$program_name: Error: Unable to stat existing "
			. "sitemap $sitemap_path: $!\n" ;
	# only the first six fields of gmtime are needed for the suffix
	( $existing_sitemap_sec, $existing_sitemap_min,
	  $existing_sitemap_hour, $existing_sitemap_mday,
	  $existing_sitemap_mon, $existing_sitemap_year )
	  = gmtime ( $existing_sitemap_mtime ) ;
	# YYYY-MM-DDThh:mm:ssTZD w3c profile for iso 8601 date/time
	$backup_suffix = sprintf ( ".%04d-%02d-%02dT%02d:%02d:%02dZ",
	  $existing_sitemap_year + 1900, $existing_sitemap_mon + 1,
	  $existing_sitemap_mday, $existing_sitemap_hour,
	  $existing_sitemap_min, $existing_sitemap_sec ) ;
	if ( -e $sitemap_path . $backup_suffix )
		{
		# backup file already exists
		# (print, not printf: the path is data and must not be
		# interpreted as a format string)
		print "$program_name: backup file $sitemap_path"
		  . "$backup_suffix already exists\n" ;
		}
	else
		{
		# list-form system: cp is run directly, without a shell
		@copy_args = ( "cp", "-pvn", "$sitemap_path",
		  "$sitemap_path$backup_suffix" ) ;
		system ( @copy_args ) == 0
			or die "$program_name: Error: Unable to copy existing "
				. "sitemap $sitemap_path to backup "
				. "$sitemap_path$backup_suffix\n" ;
		}
	# print "backup file suffix = $backup_suffix\n" ;
	}
else
	{
	print "$program_name: no existing $sitemap_path to back up\n" ;
	}

# we've backed up the existing sitemap.xml (if any)
# start a new sitemap.xml file; we start with a temporary file, and
# don't rename until the work is done, that way if we run into problems
# walking the tree then we haven't destroyed the existing sitemap

# three-argument open: the mode is given separately, so the file name
# is used exactly as it stands (no trimming of whitespace, no special
# characters interpreted); $! says why the open failed
open SITEMAP, ">", "$sitemap_path.tmp"
  or die "$program_name: unable to open working file "
    . "$sitemap_path.tmp: $!\n" ;

# timestamp of this run, taken from the $run_* values
# YYYY-MM-DDThh:mm:ssTZD w3c profile for iso 8601 date/time
$sitemap_lastmod = sprintf ( "%04d-%02d-%02dT%02d:%02d:%02dZ",
  $run_year + 1900, $run_mon + 1, $run_mday, $run_hour,
  $run_min, $run_sec ) ;

# XML declaration, opening <urlset> tag, and a comment recording when
# and by what the file was generated
print SITEMAP 
  "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
  . "<urlset xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n"
  . "  xsi:schemaLocation=\"http://www.sitemaps.org/schemas/sitemap/0.9 "
  . "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd\"\n"
  . "  xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n"
  . "<!-- sitemap.xml generated at $sitemap_lastmod\n"
  . "  by $program_name $program_version;\n"
  . "  see http://www.tatanka.com/software/map_site/index.html -->\n" ;

# now walk the directory tree starting at the site root and see what
#   files exist, and which ones are to be included in the site map;
# the easiest way to do this is to let the find command do it, then we
#   read through the results and process the names
#
# find and file are both run through list-form pipe opens, i.e. without
#   a shell, so directory and file names containing spaces or shell
#   metacharacters are passed to the commands unchanged

# compile the --exclude patterns once, before the loop
# NOTE(review): the patterns are treated as Perl regular expressions
#   matched against the path relative to the site root; the help text
#   only says "pattern", so confirm this is the intended meaning.
#   A pattern that does not compile is reported and ignored.
my @exclude_regex_list ;
foreach my $exclude_candidate ( @exclude_pattern_list )
	{
	my $exclude_regex = eval { qr/$exclude_candidate/ } ;
	if ( defined $exclude_regex )
		{
		push @exclude_regex_list, $exclude_regex ;
		}
	else
		{
		print "$program_name: Warning: ignoring invalid "
		  . "--exclude pattern $exclude_candidate\n" ;
		}
	}

open ( my $find_handle, "-|", "find", "-L", $website_root_directory,
  "-type", "f", "-print" )
  or die "$program_name: Error: Unable to run find on "
    . "$website_root_directory: $!\n" ;
@file_list = <$find_handle> ;
chomp ( @file_list ) ;
if ( not close ( $find_handle ) )
	{
	# a non-zero exit from find is reported but is not fatal;
	# whatever names it did list are still processed
	print "$program_name: Warning: find reported problems while "
	  . "scanning $website_root_directory\n" ;
	}
# print "file list = @file_list\n" ;

foreach $file_from_list ( @file_list )
	{
	$file_allowed = 0 ;
	# find prefixes each name with the starting directory; remove it
	# (and any extra slash) to get the path relative to the site root
	$file_from_list =~ s{^\Q$website_root_directory\E/*}{} ;
	# extract filename from path: everything after the last slash
	( $filename_part ) = $file_from_list =~ m{([^/]*)$} ;
	# print "$file_from_list - $filename_part\n" ;
	if ( substr ( $filename_part, 0, 1 ) eq "." )
		{
		# filename begins with "."
		$file_allowed = 0 ;
		}
	elsif ( substr ( $filename_part, -1, 1 ) eq "~" )
		{
		# filename ends with "~"
		$file_allowed = 0 ;
		}
	elsif ( $filename_part =~ m"\." )	# filename contains "."
		{
		# filename contains "." (but not at beginning): allowed
		# only if it ends in one of the listed suffixes
		foreach $file_suffix ( @suffixes_allowed )
			{
			# print "$file_from_list - $file_suffix\n" ;
			if ( $file_from_list =~ m/\.\Q$file_suffix\E$/i )
				{
				$file_allowed = 1 ;
				last ;
				}
			}
		}
	else
		{
		# filename has no suffix, we test for a "text" file
		# by asking the file command what it thinks
		my $file_full_path = $website_root_directory . $file_from_list ;
		$file_type = "" ;
		if ( open ( my $file_handle, "-|", "file", $file_full_path ) )
			{
			local $/ ;	# read all of the output at once
			$file_type = <$file_handle> ;
			$file_type = "" if not defined $file_type ;
			close ( $file_handle ) ;
			}
		# file echoes the name before its verdict; drop the echo
		# so a name such as "context" is not mistaken for text
		$file_type =~ s/^\Q$file_full_path\E:// ;
		# print "          $file_type\n" ;
		if ( $file_type =~ m/text/i )
			{ $file_allowed = 1 ; }
		else
			{ $file_allowed = 0 ; }
		}
	# preclude certain specific files: robots.txt, and sitemap.xml
	# together with its backups and temporary file in the site root
	if ( ( $file_from_list eq "robots.txt" )
	  or ( $file_from_list =~ m/^sitemap\.xml/ ) )
		{
		$file_allowed = 0 ;
		}
	# apply the --exclude patterns given on the command line
	if ( $file_allowed )
		{
		foreach my $exclude_regex ( @exclude_regex_list )
			{
			if ( $file_from_list =~ $exclude_regex )
				{
				$file_allowed = 0 ;
				last ;
				}
			}
		}
	if ( $file_allowed )
		{
		# print "  allowed: $file_from_list\n" ;
		# modification time is element 9 of the stat list
		$selected_file_mtime
		  = ( stat ( $website_root_directory . $file_from_list ) ) [ 9 ] ;
		if ( not defined $selected_file_mtime )
			{
			# without this check a failed stat would be
			# written out as a 1970 modification date
			print "  excluded (cannot stat): $file_from_list\n" ;
			next ;
			}
		( $selected_file_sec, $selected_file_min,
		  $selected_file_hour, $selected_file_mday,
		  $selected_file_mon, $selected_file_year )
		  = gmtime ( $selected_file_mtime ) ;
		# YYYY-MM-DDThh:mm:ssTZD w3c profile for iso 8601 date/time
		$file_lastmod_time
		  = sprintf ( "%04d-%02d-%02dT%02d:%02d:%02dZ",
		  $selected_file_year + 1900, $selected_file_mon + 1,
		  $selected_file_mday, $selected_file_hour,
		  $selected_file_min, $selected_file_sec ) ;
		# percent-encode characters of the path that may not
		# appear literally in a URL (space, "&", "%", non-ASCII
		# bytes, ...); ordinary names pass through unchanged
		my $file_url_path = $file_from_list ;
		$file_url_path
		  =~ s{([^A-Za-z0-9._~/!\$'()*+,;=:\@-])}{sprintf ( "%%%02X", ord ( $1 ) )}ge ;
		# then entity-escape the complete URL for use as XML text
		my $file_loc = $website_url . $file_url_path ;
		$file_loc =~ s/&/&amp;/g ;
		$file_loc =~ s/</&lt;/g ;
		$file_loc =~ s/>/&gt;/g ;
		$file_loc =~ s/"/&quot;/g ;
		$file_loc =~ s/'/&apos;/g ;
		print SITEMAP
		  "<url>\n"
		  . "  <loc>$file_loc</loc>\n"
		  . "  <lastmod>$file_lastmod_time</lastmod>\n"
		  . "  </url>\n" ;
		}
	else
		{
		print "  excluded: $file_from_list\n" ;
		}
	} # end foreach $file_from_list ( @file_list )

# terminate the document
print SITEMAP "</urlset>\n" ;

# close the working file before it is moved into place: output is
# buffered, so a write error (a full disk, for example) may only show
# up here, and a truncated file must not replace the old sitemap
close SITEMAP
	or die "$program_name: Error: Unable to finish writing "
		. "$sitemap_path.tmp: $!\n" ;

# now replace the old sitemap.xml with the temporary version
@mv_args = ( "mv", "$sitemap_path.tmp", "$sitemap_path" ) ;
system ( @mv_args ) == 0
	or die "$program_name: Error: Unable to move temporary "
		. "file $sitemap_path.tmp to $sitemap_path\n" ;

exit $EXIT_SUCCESS ;

#	end map_site.pl
