#!/usr/bin/perl

# Print frequencies of single words, N-word phrases or both in input file(s).

# (c) 2006-2009 Jim Henry III.  Creative commons licensing.
# http://www.pobox.com/~jimhenry/conlang.htm

###TODO: fix handling of apostrophes and dashes.  The latter at least
# should be stripped off if they're at the edge of an English word,
# and treated as a delimter if they occur in pairs; e.g. in Gutenberg etexts
# you have em dashes as -- linking two words with no spacing on either side.
# should not be treated the same as a hyphenated single word.
# similarly with apostrophes which often act as single quotes, more
# often prob. than as contraction or possessive marker when at word 
# boundary.  exceptions: 
# 'tis
# boys'
# ...etc

###TODO: fix defect where phrases are counted across sentence boundaries.  E.g., if
# input has:

#"And tell me how he looks. Look at him closely, Fritz. See if he is
#well and seems strong. Oh, and make him merry and happy! Bring that
#smile to his lips, Fritz, and the merry twinkle to his eyes.

# then "looks Look", "Fritz See", "strong Oh" etc should not be among
# the two-phrases we count.

# Tried doing this with the setting of $/ but it doesn't take a
# character class regex as its value.

### TODO: make the set of characters used as word delimiters a command-line option?
# or config file option?  anyway not hard-coded.

###TODO: Option to count wildcard phrases when looking at 3+ word phrases.  E.g. for each 3-word
#  phrase we would  count "word1 word2 word3" but also "word1 * word3".  For 4-word phrases,
#  count "word1 * word3 word4" and  "word1 word2 * word4" and  "word1 * * word4".  Etc.

###TODO: option to read a file with mappings of words to
### categories.  Whenever a word is found that's a member of that category, we increment
# the count for the category as well as for the individual word.  Possible format:

# PRON1SG: I me my mine;
# PRON1PL: we us our ours;
# PRON2: you your yours;
# [......]
# PRONOUNS: PRON1SG PRON1PL PRON2 PRON3SG PRON3PL;

# For cleanness of format probably we wrap each in /^$categ[$j]$/ to search for whole
# word.  But user can do something like marr[iy].* to total the counts of "marry, married, marriage," etc.

###TODO: option to sort in either order (should put totals in hash
###before sort-print to simplify code)

use strict;

my $debug = 0;
my $minphraselen = 1;
my $maxphraselen = 1;
my $lowercasing = 0;
my $lowerbound = 0;



# this one doesn't suit for gzb because it splits on ' and doesn't split on { }
#    @words = split /[-\s.?!,;:\'\"()\[\]]+/;

# this one works for gzb but splits morphemes
#    @words = split /[-\s.?!,}{;:\"()\[\]]+/;

# this one works for gzb, splitting words but not morphemes
#  @words = split /[\s.?!,}{;:\"()\[\]]+/;

my $delimiters = '[ \s.?!,}{;:\"()\[\]]+';

my $arg;

while ( $arg = shift ) {
    print "processing \$arg == $arg\n" if $debug;

    # stop processing arguments if find '--' or an argument that 
    # doesn't start with '-'; in the latter case, first put the arg
    # back for processing by <ARGV>

    if ( $arg eq "--" ) {
	last;
    }

    if ( $arg =~ m/^[^-]/ ) {
	unshift @ARGV, $arg;
	last;
    }

    if ( $arg eq "-d" ) {
	$debug = shift;
	if ( not defined $debug or $debug !~ m/^[0-9]+$/ ) { 	# bare -d with no number argument
	    unshift @ARGV, $debug;
	    $debug = 1;
	}
	print "setting \$debug to $debug\n";
	next;
    }

    if ( $arg eq "-w" ) {
	$minphraselen = $maxphraselen = 1;
	next;
    }

    if ( $arg eq "-p" ) {
	my $nextarg = shift;
	if ( $nextarg !~ m/^[1-9]+$/ ) { # too small or non-numeric
	    print "defaulting phrase length to 2\n"  if $debug;
	    unshift @ARGV, $nextarg;
	    $minphraselen = $maxphraselen = 2;
	} else {
	    $minphraselen = $maxphraselen = $nextarg;
	}
	next;
    }

    # lowercase everything
    if ( $arg eq "-l" ) {
	$lowercasing = 1;
	next;
    }

    # ignore (on reporting) words/phrases occurring N or fewer times
    if ( $arg eq "-m" ) {
	$lowerbound = shift;
	if ( not defined $lowerbound or $lowerbound !~ m/^[1-9][0-9]*$/ ) {
	    print STDERR "\n-m argument must be followed by nonnegative integer\n";
	    &display_usage;
	    exit;
	}
	next;
    }

    if ( $arg eq "-D" ) {
	my $delimiters_filename = shift;
	if ( not defined $delimiters_filename ) {
	    print STDERR "\n-D argument must be followed by name of delimiters file\n";
	    &display_usage;
	    exit;
	}
	open DELIM, $delimiters_filename or die qq(could not open delimiters file "$delimiters_filename"\n);
	$delimiters = <DELIM>;
	chomp $delimiters;
	next;
    }

    # range of lengths
    if ( $arg eq "-r" ) {
	my $range = shift;
	if ( $range =~ m/^([1-9]+)-([1-9]+)$/ ) {
	    $minphraselen = $1;
	    $maxphraselen = $2;
	    if ( $minphraselen > $maxphraselen ) {
		print STDERR "\n-r argument range values must be lesser to greater (or equal)\n";
		&display_usage;
		exit;
	    }
	    next;
	} else {
	    print STDERR "\nAfter -r next argument must have form N-M where N and M are positive integers\n";
	    &display_usage;
	    exit;
	}
    }

    # unknown argument, or -? or -h
    if ( $arg =~ m/^-/ ) {
	&display_usage;
	exit;
    }
}

my @phrasefreq;
my %wordfreq;

$/ = '';  		# read in a whole paragraph at a time, not just one line

##$/ = "\n";
my @words;

my $paranum = 0;
my @totals;

#while ( <> ) {
while ( <ARGV> ) {
    if ( $lowercasing ) {
	$_ = lc $_;
    }

    $paranum++;

    @words = split /$delimiters/o;

    my $wordcount = scalar(@words);

    for ( my $i = 0; $i < $wordcount; $i++ ) {
	if ( $words[$i] =~ m/^$/ ) {
	    # results from split when a paragraph starts with space or punctuation
	    next;
	}
	print "para $paranum word " . (1 + $i) . " = [" . $words[$i] . "]\n"  if $debug >= 3;
	if ( $debug && $words[$i] =~ m/\s/ ) {
	    print "warning: $paranum word " . (1 + $i) . " = [" . $words[$i] . "]\n";
	}
	for ( my $j = $maxphraselen; $j >= $minphraselen; $j-- ) {
	    print "working on $j word phrases\n"  if $debug >= 3;
	    # if there are at least $j words left in the para, add one instance for
	    # the $j-word phrase starting with the current word

	    my $words_left = $wordcount - $i;
	    if ( $words_left  >= $j ) {
		print "have $words_left words left so will count the $j word phrase starting with " . $words[ $i ] . "\n"  if $debug >= 3;
		$totals[$j-1]++;
		my $phrase;
		for ( my $k = $i; $k <= $i + $j - 1; $k++ ) {
		    if ( $k > $i ) {
			$phrase .= " ";
		    }
		    $phrase .= $words[ $k ];
		}
		$phrasefreq[ $j - 1 ]{ $phrase }++; 
	    }
	}
    } 
    if ( $debug >= 2 ) {
	print "\nanalyzed paragraph:\n" . $_ . "\nwith $wordcount words; and so far, frequencies are:\n";
	&print_frequencies;
    }
}

&print_frequencies;


##################################################

sub print_frequencies {
    my $percent;

    for( my $m = ($minphraselen - 1); $m < $maxphraselen; $m++ ) {
	if ( $m == 0 ) {
	    #printf "100.00000%\t" . $totals[0] . "\t" . "[TOTAL WORDS]\n";
	    printf "100.00000%\t%7d\t[TOTAL WORDS]\n", $totals[0];
	} else {
	    #print "100.00000%\t" . $totals[$m] . "\t" . "[TOTAL " . ($m + 1) . "-WORD PHRASES]\n";
	    printf "100.00000%\t%7d\t[TOTAL %d-WORD PHRASES]\n", $totals[$m], ($m + 1);
	}

	my $hashref = $phrasefreq[ $m ];
	foreach ( sort { $$hashref{$b} <=> $$hashref{$a} } keys %$hashref ) {
	    if ( $phrasefreq[ $m ]{ $_ } < $lowerbound ) {
		last;
	    }
	    #$percent = sprintf( " ", 100 * ( $phrasefreq[ $m ]{ $_ } / $totals[$m] ) );
	    #print $percent . "\t" . $phrasefreq[ $m ]{ $_ } . "\t" . $_ . "\n";
	    $percent = 100 * ( $phrasefreq[ $m ]{ $_ } / $totals[$m] );
	    printf " %02.5f%%\t%7d\t%s\n", $percent, $phrasefreq[ $m ]{ $_ }, $_;
	}
    }
}

sub display_usage {

    print STDERR <<HELP;

Usage: $0   [-w] [-p (N)] [-r M-N] [-D <filename>] [-m N] [-h] [-d (N)] [--] <files...>

-w              only count individual words
-p N            count phrases of N words (1+, default 2)
-r M-N          count phrases of M to N words (M can be = 1)
-l              lowercase all words before counting instances
-D <filename>   read file with delimiters regular expression
-m N            print frequency only of words or phrases that occur N or more times
-h              Get help (this message)
-d N            set debug trace level to N (0-3, default 0)
--              stop processing arguments and assume any remaining
                ones are filenames

<files...>      Optional input files to use as the corpus to analyze,
                or read standard input.

HELP

}
