#!/usr/bin/perl

# Compare frequences of words/phrases in two different files or sets
# of files (corpora).  (c) 2010 Jim Henry III; Creative Commons
# Attribution-Noncommercial-Share Alike 3.0 United States License.
# http://jimhenry.conlang.org


###TODO fiddle with default colors to make them more readable,
# and make them and threshold vals configurable

use strict;
use warnings;

use Getopt::Long;
Getopt::Long::Configure('bundling');

my $debug = 0;

###TODO make these configurable, or make them responsive to
# real statistical significance re: absolute sizes of the 
# compared corpora
my $threshold1 = 2;
my $threshold2 = 4;

my $blank_cells = qq(<td width="10%"> <td width="40%"> \n);

my $left_file = "";
my $right_file = "";
my $output_file;

my %left_h;
my %right_h;

my @left;
my @right;

use constant L => 0;
use constant R => 1;


sub usage {
    my $scriptname = $0;
    $scriptname =~ s/.*\///;

    print STDERR <<HELP;

$scriptname - Compare frequences of words/phrases in two different files
or sets of files (corpora).  (c) 2010 Jim Henry III; Creative Commons
Attribution-Noncommercial-Share Alike 3.0 United States
License.  http://jimhenry.conlang.org

Usage: $scriptname frequency_file_1 frequency_file_2 [-o <comparison_file.html>]

Takes as input two plain-text files, each of which is the output of
frequencies.pl analyzing some text file or set of files, and writes
(to standard output or to the filename specified with the -o option)
an HTML TABLE file showing a side-by-side color-coded comparison of the
two corpora.

HELP

}


sub sizes_debug {
    return if not $debug;
    print STDERR "left array has \$\# " . $#left . ", right has " . $#right . " \n";
    print STDERR "left array has scalar " . scalar( @left )  . ", right has " . scalar ( @right ). " \n";
    print STDERR "left hash has " . scalar( %left_h )  . ", right has " . scalar ( %right_h ). " \n";

}

# called within a sort
sub compare_within {
    my @bb = @{$b};
    my @aa = @{$a};
    return ( ( $bb[0] <=> $aa[0] ) or ( $bb[1] cmp $aa[1] ) );
}


my %style = ( 0 => "n",
	      -1 => "r",
	      -2 => "mr",
	      1 => "mc",
	      2 => "mmc",
	      3 => "u" );

# called within cell formatting, to show with color-coding how much
# more or less common one word is in one corpus than the other
sub compare_across {
    my $this_word = shift;
    my $which_col = shift;
    my $this_freq = 0;
    my $other_freq;
    if ( L == $which_col ) {
	$this_freq  = $left_h{ $this_word };
	$other_freq = $right_h{ $this_word };
    } else {
	$this_freq  = $right_h{ $this_word };
	$other_freq = $left_h{ $this_word };
    }

    if ( not defined $other_freq or 0 == $other_freq ) { 
	return 3;
    }

    my $ratio = $this_freq / $other_freq;
    if ( $ratio > $threshold2 ) {
	return 2;
    } elsif ( $ratio > $threshold1 ) {
	return 1;
    } elsif ( $ratio < 1/$threshold2 ) {
	return -2;
    } elsif ( $ratio < 1/$threshold1 ) {
	return -1;
    } else {
	return 0;
    }
}


sub get_input {
    open LEFT, $left_file	or die "Can't open $left_file for reading\n";
    open RIGHT, $right_file	or die "Can't open $right_file for reading\n";
    
    my $pct = 0;
    my $count = 0;	###TODO unused, will use to set thresholds of significance
    my $text = "";
    
    while ( <LEFT> ) {
	s/[\r\n]+//;
	if ( not m/^ *([0-9]+\.[0-9]+)%\s+([0-9]+)\s+(.*)/ ) {
	    die "Bad format in $left_file -- must be output of frequencies.pl\n";
	}
	$pct = $1;
	$count = $2;
	$text = $3;
	if ( $text !~ m/^\[TOTAL/ ) {
	    print STDERR "stashing value $pct $text\n" if $debug;
	    $left_h{ $text } = $pct;
	    my @arr = ( $pct, $text );
	    push @left, \@arr;
	}
    }
    close LEFT;
    
    while ( <RIGHT> ) {
	s/[\r\n]+//;
	if ( not m/^ *([0-9]+\.[0-9]+)%\s+([0-9]+)\s+(.*)/ ) {
	    die "Bad format in $right_file -- must be output of frequencies.pl\n";
	}
	$pct = $1;
	$count = $2;
	$text = $3;
	if ( $text !~ m/^\[TOTAL/ ) {
	    $right_h{ $text } = $pct;
	    my @arr = ( $pct, $text );
	    push @right, \@arr;
	}
    }
    close RIGHT;
    
    @left  = sort compare_within @left;
    @right = sort compare_within @right;

    &sizes_debug;
}


sub print_header {
    my $title = "Comparison of word/phrase frequencies between $left_file and $right_file";
    print qq(<html><head><title>$title</title>\n);

    print qq(<meta name="generator" content="$0">\n);

    print qq(<style><!--\n)
	. qq( .u { font-style: italic;  background-color: #0F0; color: #F0F } \n)
	. qq( .mmc { background-color: #C00 ; color: #FFF } \n)
	. qq( .mc {  background-color: #800;  color: #0CF } \n)
	. qq( .r {  background-color: #00F;  color: #FF0 } \n)
	. qq( .mr {  background-color: #008;  color: #FF0 } \n)
	. qq(--></style>\n);

    print "</head><body>\n";

    print qq(<h1>$title</h1>\n);

    print qq(<p>Key:</p> \n)
	. qq(<ul>\n)
	. qq(<li class="u">unique, not in other corpus</li>\n)
	. qq(<li class="mmc">more than $threshold2 times as common as in other corpus</li>\n)
	. qq(<li class="mc">more than $threshold1 times as common as in other corpus</li>\n)
	. qq(<li class="n">between 1/$threshold1 and $threshold1 times as common as in other corpus</li>\n)
	. qq(<li class="r">less than 1/$threshold1 as common as in other corpus</li>\n)
	. qq(<li class="mr">less than 1/$threshold2 as common as in other corpus</li>\n)
	. qq(</ul>\n);
	


    print qq(<table cellspacing="5" cellpadding="5" border="1" width="100%">\n);
    print qq(<tr><th colspan=2 width="50%">$left_file\n);
    print qq(<th colspan=2 width="50%">$right_file\n);
    print qq(</tr>\n);
}


sub table_break {
    print qq(</table>\n);
    print qq(<table cellspacing="5" cellpadding="5" border="1" width="100%">\n);
}


sub print_footer {
    print "</table></body>\n";
}


sub format_cells {
    my $which_col = shift;
    my $arrref = shift;
    my @arr = @{ $arrref };
    my $pct = $arr[0];
    my $text =$arr[1];

    my $compared = &compare_across( $text, $which_col );
    my $cells = qq(<td width="10%"> ) . $pct
	. qq( <td class=") . $style{ $compared } . qq(" width="40%"> ) . $text . "\n";

    return $cells;
}


sub do_comparison {
    my $left_idx = 0;
    my $right_idx = 0;
    my $which_col = 0;
    my $rows = 0;
    
    &sizes_debug;
    
    while ( $left_idx < $#left  or  $right_idx < $#right ) {
	print STDERR "row $rows, left $left_idx, right $right_idx \n"	if $debug;
	my $left_cells = $blank_cells;
	my $right_cells = $blank_cells;

	if ( $left_idx < $#left && 
	     ( $right_idx >= $#right || 
	       $left[$left_idx]->[0] >= $right[$right_idx]->[0] ) 
	    )
	{
	    $left_cells = &format_cells( L, $left[ $left_idx ] );
	    $left_idx++;
	}

	if ( $right_idx < $#right && 
	     ( $left_idx >= $#left || 
	       $right[$right_idx]->[0] >= $left[$left_idx]->[0] ) 
	    )
	{
	    $right_cells = &format_cells( R, $right[ $right_idx ] );
	    $right_idx++;
	}

	print "<tr>\n" . $left_cells . $right_cells . "</tr>\n";

	###TODO make this configurable
	if ( $rows++ % 100 == 99 ) {
	    &table_break;
	}
    }
}



#+++++ main +++++

my $rc = GetOptions(
    	'o|output=s'	=> \$output_file,
        'd|debug'	=> \$debug,
);

if ( not $rc ) {
    &usage;
    exit(1);
}

$left_file = shift;
$right_file = shift;

my $remaining_arg = shift;
if ( defined $remaining_arg ) {
    &usage;
    exit(1);
}

if ( not defined $right_file ) {
    &usage;
    exit(1);
}

&get_input;

if ( defined $output_file ) {
    open OUT, ">$output_file"	or die "Can't open $output_file for writing\n";
    select OUT;
}

&print_header;

&do_comparison;

&print_footer;

close;

exit(0);
