
package WeightRandomList;

=head1 NAME

WeightRandomList.pm

=head1 SYNOPSIS

    # Read weights from a file and then use them to randomize a list of songs

    use WeightRandomList;

    my @songs = split "\n", `find $ENV{HOME}/Music -name *.mp3`;
    my $weights_file = $ENV{HOME} . "/.songweights";
    my %weights = %{ weights_from_file( $weights_file ) };
    my @songs_random = @{ make_weighted_list( \%weights, \@songs ) };

    # Declare a weights hash and then use it to randomize @pictures

    my %pic_weights = ( "Waterhouse" => 3, 
	"Jack Kirby" => 5, 
	"family_photos" => 2, 
	"thumbnail" => 0 );
    my @pictures = split "\n", `find $ENV{HOME}/Pictures -name *.jpg`;
    my @pics_random = @{ make_weighted_list( \%pic_weights, \@pictures ) };

    # The same, but with debugging messages

    WeightRandomList::set_debug(1);
    my @pics_random = @{ make_weighted_list( \%pic_weights, \@pictures ) };

    # The same, but with the weights inverted (all nonzero weights changed 
    # to their reciprocal).

    WeightRandomList::set_inverse(1);
    my @pics_random = @{ make_weighted_list( \%pic_weights, \@pictures ) };

    # Build a weighted list yourself by getting weights for each string as you
    # decide whether to add them to a given list and how many copies to add.
    # (The code below won't randomize the list, just build a list with multiple
    # copies of the things with weights >= 2 and none of the items with weights < 1.)

    my @list;
    while ( my $str = <> ) {
        my $working_weight = calc_weight( $str, \%pic_weights );
	while ( $working_weight >= 1 ) {
	    push @list, $str;
	    --$working_weight;
	}
    }

=head1 DESCRIPTION

WeightRandomList is a module designed to randomize lists of strings while applying
weights to them, i.e., checking whether one or more regular expressions match each
string and inserting zero, one, or many copies of the string into the resulting
randomized list depending on the weight associated with the regular expression.

You can build a weights hash in your own code and pass it to make_weighted_list()
or read  weights from a file with weights_from_file().

I use it in a number of ways, in creating slideshows of random images, shuffling
random songs from my music library, copying a random subset of songs to my mp3
player, in a web crawler that downloads random images, and in my
textual slideshow, which slowly scrolls a random series of paragraphs from text files
on one's hard drive (available from L<my website|http://jimhenry.conlang.org/software/>).

=cut

use strict; 
use warnings;
use List::Util 'shuffle';

our $VERSION = "1.0";
my $debug = 0;
my $inverse = 0;

BEGIN {
    use Exporter();
    our @ISA = qw(Exporter);

    our @EXPORT = qw( 
        &make_weighted_list
	&weights_from_file
 	&set_debug
 	&set_inverse
        &calc_weight 
    );

    return 1;
}

=head1 FUNCTIONS

=head2 weights_from_file( <filename> )

Read a configuration file and build a hash suitable for passing by
ref to make_weighted_list.  The file should have regex/weight pairs,
one per line, separated by tabs.  Blank lines are OK.  '#' starts
a comment, unless it's preceded by '\', i.e. you can put \# in
a regex to match a literal #.

=cut

sub weights_from_file {
    my $filename = shift;
    open WEIGHTS, $filename	or die "can't open $filename for reading\n";

    local $/ = "\n";
    
    my %weights;
    my $regex;
    my $weight;
    my $otherstuff;
    my $origline = "";
    while ( <WEIGHTS> ) {
	$origline = $_;
	print "line before removing comments:\n$_"	if $debug;

	# a '#' sign preceded by '\' does not begin a comment
	s/^#.*//;		# remove comment on line by itself:
	s/([^\\])#.*/$1/;	# remove comment following other text
	print "line after removing comments:\n$_"	if $debug;

	next if m/^\s*$/;
	s/\s+$//;		# remove trailing whitespace
	undef $weight;
	($regex, $weight, $otherstuff) = split /\t/;
	if ( not defined $weight ) {
	    $weight = 2;
	} elsif ( not $weight =~ m/^[0-9.]+$/ ) {
	    warn "$filename line $.: second field should be numeric\n$origline\n";
	    $weight = 2;
	}

	if ( not $inverse or 0 == $weight ) {
	    $weights{ $regex } = $weight;
	} else {
	    $weights{ $regex } = ( 1 / $weight );
	}

	if ( defined $otherstuff and $otherstuff ne "" ) {
	    warn "$filename line $.: extraneous stuff after regex and weight being ignored\n$origline\n";
	}
    }

    if ( not scalar keys %weights ) {
	warn "$filename: could not find any regex-weight pairs\n";
    }
    return \%weights;
}

#==========


=head2 make_weighted_list( <weights hash ref>, <list array ref>, [<default weight>] );

Take a list of strings and produce a randomized list where the number of
copies of each element of the original list is determined by weights.  The
weights are pairs of regular expressions and floating-point numbers.  If a
regex has weight 2, every string matching that regex in the original list will
appear twice in the randomized list; if it has weight 0.5, roughly half the
strings matching that regex will appear in the randomized list; if it has
weight 1.5, roughly half the strings matching that regex will appear once
while the rest appear twice.  Weight 0 means don't include any strings
matching that regex.

A string matching two or more regexes will appear a number of times
corresponding to the I<product> of those weights.

Normally, strings matching none of the weight regexes will appear
once.  If the optional default weight argument is given, 
strings matching none of the weight regexes will appear that many
times instead (usually you would pass  zero when you only want
strings matching one or more regexes).

=cut

sub make_weighted_list {
    my $weights_ref = shift;
    my $list_ref = shift;
    my $default_weight = shift;
    if ( not defined $default_weight ) {
	$default_weight = 1;
    }

    if ( not defined $list_ref or ref $list_ref ne "ARRAY"
	or not defined $weights_ref or ref $weights_ref ne "HASH"
	or $default_weight !~ m/^[0-9.]+$/ ) {
	die "bad arguments to WeightRandomList::make_weighted_list()";
    }

    my %weights = %{ $weights_ref };
    if ( $debug ) {
	my $k = scalar ( keys %weights );
	print "$k weights\n";
    }
    my @input_list = @{ $list_ref };
    my @med_list;

    foreach ( @input_list ) {
	my $str = $_;
	my $matches = 0;
	my $overall_weight = 1;
	my $copies = 0;
	###TODO maybe rewrite using calc_weight()? 
	foreach ( keys %weights ) {
	    my $regex = $_;
	    if ( $str =~ m/$regex/ ) {
		$matches++;
		print "matched on $regex\n"	if $debug;
		my $weight = $weights{ $regex };
		$overall_weight *= $weight;
	    }
	}

	if ( not $matches ) {
	    $overall_weight *= $default_weight;
	}

	my $working_weight = $overall_weight;
	while ( $working_weight >= 1 ) {
	    push @med_list, $str;
	    --$working_weight;
	    ++$copies;
	}

	if ( $working_weight > 0 && (rand) < $working_weight ) {
	    push @med_list, $str;
	    ++$copies;
	}

	if ( $debug ) { # 	&& $overall_weight != 1 ) { 
	    print "added $copies copies of $str with weight $overall_weight\n";
	}
    }

    #my @output_list = sort { int ( rand 100000001 ) - 50000000 } @med_list;
    my @output_list = shuffle @med_list;
    return \@output_list;
}

=head2 calc_weight( <string>, <weights hash reference>, [<default weight>] )

Calculate the overall weight of the regular expressions that apply to a 
particular string.

=cut

sub calc_weight {
    my $str = shift;
    my $weights_ref = shift;
    my $default_weight = shift;
    if ( not defined $default_weight ) {
	$default_weight = 1;
    }

    if ( not defined $weights_ref or ref $weights_ref ne "HASH"
	or $default_weight !~ m/^[0-9]+$/ ) {
	die "bad arguments to WeightRandomList::calc_weight()";
    }

    my %weights = %{ $weights_ref };

    my $matches = 0;
    my $overall_weight = 1; # $default_weight;
    foreach ( keys %weights ) {
	my $regex = $_;
	if ( $str =~ m/$regex/ ) {
	    $matches++;
	    print "matched on $regex\n"	if $debug;
	    my $weight = $weights{ $regex };
	    $overall_weight *= $weight;
	}
    }

    if ( not $matches ) {
	$overall_weight *= $default_weight;
    }
    
    return $overall_weight;
}


=head2 set_inverse( <boolean> )

Turn on or off inverse weights.  If this is true, weights_from_file()
will invert all nonzero weights, replacing them with their reciprocal.

(I don't currently remember what use case I had in mind when I added
this feature, but I don't see a reason to remove it.)

=cut

sub set_inverse {
    $inverse = shift;
}


=head2 set_debug( <boolean> )

Turn on or off debugging messages.

=cut

sub set_debug {
    $debug = shift;
    if ( not defined $debug or $debug !~ m/^[0-9]+$/ ) {
	warn "bad value passed to WeightRandomList::set_debug";
	$debug = 0;
    }
}

return 1;

=head1 AUTHOR

Jim Henry III, L<http://jimhenry.conlang.org/software/>

=cut

=head1 LICENSE

This library is free software; you may redistribute it and/or modify it under
the same terms as Perl itself.

=cut

