
=head1 NAME

ImageSites.pm

=head1 SYNOPSIS

  use ImageSites;

  # initialize the library's configuration variables
  image_library_init(
      'scriptname' 	=> $0,
      'logfile' 	=> "images.log",
      'debug' 		=> 1,
      'timestamps' 	=> 1,
      'max_errs' 	=> 10,
      'pausetime'	=> 300,  # five minutes
      'random_wait'	=> 1,
      'do_exponential_backoff' => 1,
      'verbose'		=> 1,
      'minheight'	=> 200,
      'minwidth'	=> 200,
      'minarea'		=> 40000,
      'minima_or'	=> 0,
      'save_dir' 	=> "$ENV{HOME}/Pictures/goldenage",
      );

  # download an HTML page
  my $html_text = get_page( "http://goldenagepaintings.blogspot.com" );
  die if not $html_text;

  # get image URLs from the page
  my @images = $html_text =~ m/<img (?: [^>]+ ) src \s* = \s* ['"] ([^'"]+) ['"] (?: [^>]* )>/gix;

  # download the images, pausing after each
  for my $url ( @images ) {
      unless ( $url =~ m/^http/ ) {
	  $url = "http://goldenagepaintings.blogspot.com/" . $url;
      }
      my $img_content = get_page( $url );
      die if not $img_content;
      $url =~ s!.*/!!; # strip everything but filename
      save_image( $url, $img_content );
      randpause;
  }

=head1 DESCRIPTION

This is a library of functions that used to be shared between
crawl-web-for-images.pl and some other scripts I wrote that were tailored for
getting images from specific sites (e.g. epilogue.net and artmagick.com).
However, those other scripts became obsolete as the sites in question shut down
or changed their design over time in ways that make it hard for a crawler to get
anything useful from them, and the general-purpose crawl-web-for-images.pl is
all that's left.

=cut

package ImageSites;

use strict;
use warnings;
use Carp;
use List::Util qw( any );
use Encode;
use LWP::UserAgent;
use Data::Dumper;

our $VERSION = "1.0";

sub writelog($);
sub savepage($$);
sub maybe_quit($);
sub shutdown($);
sub randpause;
sub startlog;
sub parsetime($);
sub get_page($);
sub save_image($$);
sub too_small($$);

my %unit_multipliers = (
    	"s" => 1,
    	"m" => 60,
    	"h" => 3600,
    	"d" => 86400
    );

my $seq_http_err_count = 0;

###TODO make configurable
my $max_pausetime = 10000;	# 10 Ksec, about 2.8 hours

my %config = ();
my $ua;
my $utf8_decoder;
my $raw_decoder;


BEGIN {
    use Exporter();
    our @ISA = qw(Exporter);

    our @EXPORT = qw( 
	image_library_init
	savepage
	maybe_quit
	shutdown
	randpause
	startlog
	writelog
	parsetime
	get_page
	save_image
	too_small
	set_config_var
        match_url
        domain_name
        base_directory
 	);

    return 1;
}

=head1 FUNCTIONS

=head2 image_library_init( hash of options )

Takes a hash of options, does some sanity checking on them, creates a
LWP::UserAgent object and a utf8 decoder, and initializes the logfile.

Valid option keys:

debug                                   Turn debug mode on or off.
do_exponential_backoff                  Double sleep time after an unsuccessful GET.
logfile                                 File to write messages to.
max_errs                                Exit after this many nonfatal errors.
minarea                                 Images must be at least this big.  Use too_small to check 
					against these variables. It's the caller's responsibility
					to figure out the width and height of an image.
minheight                               "
minwidth                                "
minima_or                               The width and height criteria are or'd and not and'd -- if 
					either is satisfied, we say it's big enough.
pausetime                               Baseline sleep time, actual sleep time may vary if random_wait 
					is set
random_wait                             If true, vary the sleep time around the baseline randomly.
quiet                                   Print no output to terminal.
save_dir                                Directory to which to save files.
scriptname                              The name of the script that called us, for debugging purposes.
timestamps                              Whether to print timestamps to the log file.
verbose                                 Be more detailed in our messages.

=cut

my @valid_variable_names = qw (
    debug
    do_exponential_backoff
    logfile
    max_errs
    minarea
    minheight
    minima_or
    minwidth
    pausetime
    quiet
    random_wait
    save_dir
    scriptname
    timestamps
    verbose
    );

sub image_library_init { 
    croak "image_library_init() called with an odd number of options" if scalar @_ % 2 == 1;
    
    %config = @_;

    my $var_errs = 0;
    for my $var ( keys %config ) {
	unless ( any { $var eq $_ } @valid_variable_names ) {
	    carp "unrecognized option $var passed to image_library_init()";
	    $var_errs++;
	}
    }
    exit(1) if $var_errs;
    
    if ( not $config{scriptname} ) {
	###TODO maybe use $0 here?
	$config{scriptname} = "ImageSites.pm";
    }

    if ( defined $config{save_dir} ) {
	if ( $config{save_dir} !~ m(/$) ) {
	    $config{save_dir} =  $config{save_dir} . "/";
	}
    } else {
	$config{save_dir} = "";
    }
    
    if ( $config{logfile} ) {
        if ( $config{logfile} !~ m(/) ) {
	    $config{logfile} = $config{save_dir} . $config{logfile};
	}
	&startlog;
    }
    
    if ( $config{verbose} ) {
	foreach ( sort keys %config)  {
	    if ( defined $config{ $_ } ) {
		writelog $_ . " == " . $config{$_} . "\n";
	    }
	}
    }

    if ( not defined $config{max_errs} ) {
	$config{max_errs} = 5;
    }
    
    $ua = LWP::UserAgent->new;
    die "Couldn't create LWP::UserAgent object"  if not ref $ua;
    $ua->agent( $config{scriptname} );
    
    $utf8_decoder = Encode::find_encoding( "utf8" );
    die "Couldn't get utf8 decoder"              if not ref $utf8_decoder;
}

=head2 set_config_var( config variable name, value )

Set the specified config variable to the value.

=cut

sub set_config_var($$) {
    my $varname = shift;
    my $value = shift;
    die if not defined $value  or  not defined $varname;

    $config{ $varname } = $value;
}


=head2 savepage( filename, content )

Write the content to the specified file, with some error checking.
Use this everything except images, for which see save_image().

=cut

sub savepage($$) {
    return 	if not $config{debug};
    my $file = shift;
    if ( $file !~ m(/) and defined $config{save_dir} ) {
	$file = $config{save_dir} . $file;
    }
    my $content = shift;
    die if not defined $content;

    if ( not open SAVING, ">:raw", $file ) {
	writelog qq(open "$file" failed with error "$!"\n);
	&maybe_quit(0);
	return undef;
    }

    print SAVING $content 	or die "writing to $file failed";
    my $len = length $content;
    close SAVING;
    writelog "wrote $len bytes to $file\n";
}

#==========

=head2 parsetime( time string )

Convert a time string which may have a number followed by a unit into
a number of seconds.  Return -1 if the unit isn't recognized or the
argument otherwise doesn't match the regular expression.

=cut


sub parsetime($) {
    my $timestring = shift;
    if ( $timestring =~ m/^(\d+)([dhms])?$/ ) {
	# required part: one or more digits
	my $scalar = $1;
	return -1 	if not defined $scalar;
	# optional part: unit abbrev, s(ec) m(inutes) h(ours) d(ays)
	my $units = $2;
	if ( defined $units ) {
	    return $scalar * $unit_multipliers{ $units };
	} else { 
	    # interpret as seconds if no unit specified
	    return $scalar;
	}
    } else {
	return -1;
    }
}


#==========

=head2 randpause()

Sleep for a configured amount of time.  Typically used to wait a bit after a
download before hitting the same server again, or before doing another
http get on any server.

=cut

sub randpause {
    # do exponential backoff if we have had recent HTTP errors
    my $this_pause = $config{pausetime};
    return	if ( $this_pause <= 0 );

    if ( $config{do_exponential_backoff} ) {
	$this_pause *= ( 2 ** $seq_http_err_count );
	if ( $this_pause > $max_pausetime ) {
	    $this_pause = $max_pausetime;
	}
    }

    if ( $config{random_wait} ) {
	$this_pause = int ((rand() + 0.5) * $this_pause);
    }

    if ( $config{verbose} ) {
	writelog "waiting $this_pause seconds (basic wait == $config{pausetime} seconds, random wait " . ( $config{random_wait} ? "on" : "off" ) . ", number of recent HTTP errors == $seq_http_err_count)\n";
    }

    select undef, undef, undef, $this_pause;
}

#==========

=head2 startlog()

Initialize the log file.

=cut

my $log_fh;

sub startlog {
    die if not defined $config{logfile};
	
    open $log_fh, ">>" . $config{logfile}	or die "can't open $config{logfile} for appending";
    my $oldfh = select $log_fh;
    $| = 1;		# flush log file on every print
    select $oldfh;
    if ( not $config{timestamps} ) {
	print $log_fh "starting " . $config{scriptname} . "  at " .  localtime() . "\n";
    } else {
	writelog "starting " . $config{scriptname} . "\n";
    }
}


#==========

=head2 writelog( message )

Write the specified message to the log file and/or the terminal, depending on 
configuration variables.

=cut

sub writelog($) {
    my $arg = shift;
    if ( not $config{quiet} ) {
	print STDOUT $arg;
    }

    if ( $config{logfile} ) {
	if ( $config{timestamps} ) {
	    $arg = localtime() . "  " . $arg;
	}
	print $log_fh $arg;
    }
}


#====================

=head2 maybe_quit( fatality level )

If the argument is 1, log a message and exit.  If the argument is 0, 
keep track of how many nonfatal errors we've had and exit only if we've
exceeded that.

=cut

my $errcount = 0;
sub maybe_quit($) {
    my $is_fatal = shift;
    if ( $is_fatal ) {
	writelog "ending at " . localtime() . " because of a fatal error\n";
	&shutdown(2);
    } elsif ( ++$errcount >= $config{max_errs} ) {
	writelog "ending at " . localtime() . " because of too many nonfatal errors\n";
	&shutdown(1);
    } else {
	return;
    }
}

#==========

=head2 shutdown( exit code )

Write a log message and exit.

=cut


sub shutdown($) {
    my $exit_code = shift;
    if ( $config{logfile} ) {
	writelog "exiting with exit code $exit_code\n";
	close $log_fh;
    }
    exit $exit_code;
}

#==========

=head2 too_small( width in pixels, height in pixels )

Check an image's size against our width, height, area, and
minima_or configuration variables.  Return 1 if too small,
0 otherwise.

=cut



sub too_small($$) {
    my $width = shift;
    my $height = shift;
    if ( not defined $width or not defined $height ) {
	return 0;
    }

    my $area = $width * $height;
    my  $too_small_msg = "skipping image because at $width by $height pixels it's too small\n";

    # the logic is reversed because an "or" from the user's perspective
    # (get image if its height is at least this OR its width is at least that)
    # is an "and" when we're checking if it's too small
    if ( $config{minima_or} ) {
	if ( ( $config{minheight} && $height < $config{minheight} )
	     and ( $config{minwidth} && $width < $config{minwidth} ) ) 
	{
	    writelog $too_small_msg	if $config{verbose};
	    return 1;
	}
    } else {
	if ( ( $config{minheight} && $height < $config{minheight} )
	     or ( $config{minwidth} && $width < $config{minwidth} ) )
	{
	    writelog $too_small_msg	if $config{verbose};
	    return 1;
	}
    }

    if ( $config{minarea}  && $area < $config{minarea} ) {
	writelog $too_small_msg		if $config{verbose};
	return 1;
    }
    
    return 0;
}

=head2 save_image( filename, image content )

Save the image to the specified path and file.  If a file of that name already
exists and our new image content is larger, overwrite.  If anything goes wrong
with saving, return undef, otherwise 1 on success.

=cut

sub save_image($$) {
    my $savepath = shift;
    my $content = shift;
    my $size = length $content;

    if ( $savepath !~ m(/) and defined $config{save_dir} ) {
	$savepath = $config{save_dir} . $savepath;
    }

    if ( -e $savepath ) {
	if ( -s $savepath >= $size ) {
	    writelog qq($savepath exists and has greater or equal size as the image we just downloaded, so not saving it\n);
	    return;
	}
	unless ( unlink $savepath ) {
	    writelog qq(Couldn't delete old, smaller version of $savepath\n);
	    &maybe_quit(0);
	    return;
	}
    }
    
    if ( not open IMGFILE, ">" . $savepath ) {
	writelog "failed to open $savepath for writing\n";
	&maybe_quit(0);
	return;
    }
    
    if ( not print IMGFILE $content ) {
	writelog "failed to write to $savepath\n";
	close IMGFILE;
	&maybe_quit(0);
	return;
    }
    
    close IMGFILE;
    writelog "saved $savepath, " . ( length $content ) . " bytes\n";
    return 1;
}


=head2 match_url( first URL, second URL )

Check if two URLs are the same after doing some simple transformations
to account for the way equivalent URLs can vary.

=cut

sub match_url {
    croak "wrong number of arguments to match_url()" if scalar @_ != 2;
    my @urls = map { s!/$!!;		# remove trailing slash
		     s/^https/http/;    # don't say they're different if protocol is the only difference
		     $_;		# without this line we transform everything to the number of
		     			# https substitutions made
    } @_;
    return ( $urls[0] eq $urls[1] );
}




=head2 get_page( URL )

Get a page or image from a URL.  If the URL ends in an image file extension, but
the content-type header indicates it's actually HTML, check the page for links
and try to download the actual image (calling ourselves recursively).

If called in scalar context, return the page/image content or undef if something
went wrong. 

If called in list context, and the page was redirected, return a two-element list
of the page content and the redirected URL.

=cut

sub get_page($) {
    my $arg_url = shift;
    if ( not defined $arg_url ) {
	writelog qq(Undefined argument to ImageSites::get_page\n);
	&maybe_quit(0);
	return undef;
    }

    if ( $arg_url =~ m/^\s*$/ ) {
	writelog qq(Empty or blank argument to ImageSites::get_page\n);
	&maybe_quit(0);
	return undef;
    }

    my $url = eval { $utf8_decoder->encode( $arg_url ) };
    if ( $@ ) {
	my $eval_err = chomp $@;
	writelog qq(encode failed with error "$eval_err"\n);
	&maybe_quit(0);
	return undef;
    }

    #my $url = Encode::from_to( $arg_url, "utf8", "raw" );

    $url =~ s/([\x80-\xFF])/"%" . sprintf( "%02X", ord($1) ) /eg;
    if ( $url ne $arg_url ) {
	writelog "fixed high-bit chars in url: $arg_url > $url\n";
    }

    writelog "fixing to get $url\n";

    my $url_changed = 0;
    my $response = $ua->get( $url );
    if ( not $response->is_success ) {
	$seq_http_err_count++;
	my $status = $response->status_line;
	writelog "downloading $url failed: $status\n";
	# no real problem if a domain doesn't have robots.txt
	if ( $url !~ m([a-z]+://[^/]+/robots.txt) ) {
	    &maybe_quit(0);
	}
	return undef;
    } else {
	$seq_http_err_count = 0;

	my $prev = $response->previous;
	while ( $prev ) {
	    writelog "previous response found\n" if $config{verbose};
	    my $location = $prev->header( 'location' );
	    if ( $location ) {
		writelog "location header found: $location\n"   if $config{verbose};
		unless ( match_url( $url, $location ) ) {
		    writelog "resetting \$url to $location from previous response (was $url)\n";
		    $url = $location;
		    $url_changed = 1;
		}
	    }
	    $prev = $prev->previous;
	}

	if ( $response->is_redirect && $config{verbose} ) {
	    my @redirects = $response->redirects;
	    #if ( $config{verbose} ) {
	    writelog "Response was redirected\n"; 
	    writelog Dumper @redirects;
	}

	if ( $url =~ m/(jpe?g|svg|gif|png)$/i and $response->header("content-type") !~ m!^image! ) {
	    writelog "mismatch between filename and content-type header: $url has content-type " . $response->header("content-type") . "\n";
	    if ( $response->header("content-type") =~ m!^text/html! ) {
		writelog "going to parse HTML content and look for the real image behind this frame page\n";
		my @img_tags = ( $response->decoded_content =~ m!<img[^/]+src\s*=\s*["']([^'"]+)["']!g );
		if ( scalar @img_tags < 1 ) {
		    writelog "no image tags on page\n";
		} elsif ( scalar @img_tags > 1 ) {
		    writelog "multiple image tags on page, can't be sure which we want\n";
		    if ( $url =~ m!wiki[pm]edia.*File:([^/]+)! ) {
			my $fn_substr = quotemeta($1);
			my @versions = grep m/$fn_substr/, @img_tags;
			###TODO more logic for finding the largest-size
			# version of the image, instead of just picking the first one
			if ( scalar @versions >= 1 ) {
			    return get_page( $versions[0] );
			}
		    }
		} else {
		    return get_page( $img_tags[0] );
		}
		
	    }
	}
	if ( wantarray ) {
	    if ( $url_changed ) {
		return ( $response->decoded_content, $url );
	    } else {
		return ( $response->decoded_content );
	    }
	} else {
	    return $response->decoded_content;
	}
    }
}


=head2 base_directory( URL )

Return the base directory of an URL (stripping off the filename part).

=cut

sub base_directory($) {
    my $url = shift;
    my $base_dir = $url;
    $base_dir =~ s(/[^/]+$)(/);
    if ( $base_dir =~ m!^[a-z]+?:[0-9]*//$! ) {
	# we stripped off too much; revert to original URL, which must be a 
	# domain name with no trailing slash
	$base_dir = $url . "/";
    }
    return $base_dir;
}

=head2 domain_name( URL )

Return the domain name part of an URL.

=cut

sub domain_name($) {
    my $domain_name = shift;
    $domain_name =~ s! ([a-z]+: [0-9]* // [^/]+ ) / .* !$1!x;
    return $domain_name;
}



return 1;

=head1 DEPENDENCIES

C<strict>, C<warnings>, C<Carp>, C<List::Util> and C<Encode> are in the standard
library.  C<LWP::UserAgent> is available from CPAN.


=head1 AUTHOR

Jim Henry III, L<http://jimhenry.conlang.org/software/>


=head1 LICENSE

This script is free software; you may redistribute it and/or modify it under
the same terms as Perl itself.
