#!/usr/bin/perl

# Print frequencies of single words, N-word phrases or both in input file(s).

# (c) 2006 Jim Henry III.  Creative commons licensing.
# http://www.pobox.com/~jimhenry/conlang.htm

###TODO: Option to count wildcard phrases when looking at 3+ word phrases.  E.g. for each 3-word
#  phrase we would  count "word1 word2 word3" but also "word1 * word3".  For 4-word phrases,
#  count "word1 * word3 word4" and  "word1 word2 * word4" and  "word1 * * word4".  Etc.

###TODO: option to read a file with mappings of words to
### categories.  Whenever a word is found that's a member of that category, we increment
# the count for the category as well as for the individual word.  Possible format:

# PRON1SG: I me my mine;
# PRON1PL: we us our ours;
# PRON2: you your yours;
# [......]
# PRONOUNS: PRON1SG PRON1PL PRON2 PRON3SG PRON3PL;

# For cleanness of format probably we wrap each in /^$categ[$j]$/ to search for whole
# word.  But user can do something like marr[iy].* to get marry, married, marriage, etc.

###TODO: print percentages as well as/instead of totals?

###TODO: sort before output instead of requiring user  to filter it through sort utility

use strict;

my $debug = 0;
my $minphraselen = 1;
my $maxphraselen = 1;

my $arg;

while ( $arg = shift ) {
    print "processing \$arg == $arg\n" if $debug;

    # stop processing arguments if find '--' or an argument that 
    # doesn't start with '-'; in the latter cast, first put the arg
    # back for processing by <ARGV>

    if ( $arg eq "--" ) {
	last;
    }
    if ( $arg =~ m/^[^-]/ ) {
	unshift @ARGV, $arg;
	last;
    }

    if ( $arg eq "-d" ) {
	$debug = shift;
	if ( $debug !~ m/^[0-9]+$/ ) { 	# bare -d with no number argument
	    unshift @ARGV, $debug;
	    $debug = 1;
	}
	print "setting \$debug to $debug\n";
	next;
    }
    if ( $arg eq "-w" ) {
	$minphraselen = $maxphraselen = 1;
	next;
    }
    if ( $arg eq "-p" ) {
	my $nextarg = shift;
	if ( $nextarg !~ m/^[1-9]+$/ ) { # too small or non-numeric
	    print "defaulting phrase length to 2\n"  if $debug;
	    unshift @ARGV, $nextarg;
	    $minphraselen = $maxphraselen = 2;
	} else {
	    $minphraselen = $maxphraselen = $nextarg;
	}
	next;
    }

    # range of lengths
    if ( $arg eq "-r" ) {
	my $range = shift;
	if ( $range =~ m/^([1-9]+)-([1-9]+)$/ ) {
	    $minphraselen = $1;
	    $maxphraselen = $2;
	    if ( $minphraselen > $maxphraselen ) {
		print STDERR "-r argument range values must be lesser to greater (or equal)\n";
		&display_usage;
		exit;
	    }
	    next;
	} else {
	    print STDERR "After -r next argument must have form N-M where N and M are positive integers\n";
	    &display_usage;
	    exit;
	}
    }

    # unknown argument, or -? or -h
    if ( $arg =~ m/^-/ ) {
	&display_usage;
	exit;
    }
}

my @phrasefreq;
my %wordfreq;

$/ = '';  		# read in a whole paragraph at a time, not just one line
my @words;

my $paranum = 0;
my $total_wordcount = 0;

#while ( <> ) {
while ( <ARGV> ) {
    $paranum++;
    # splitting on apostrophe
#    @words = split /[-\s.?!,;:\'\"()\[\]]+/;

    @words = split /[-\s.?!,;:\"()\[\]]+/;
    my $wordcount = scalar(@words);
    $total_wordcount += $wordcount;

    for ( my $i = 0; $i < $wordcount; $i++ ) {
	if ( $words[$i] =~ m/^$/ ) {
	    # results from split when a paragraph starts with space or punctuation
	    $total_wordcount--;
	    next;
	}
	print "para $paranum word " . (1 + $i) . " = [" . $words[$i] . "]\n"  if $debug >= 3;
	if ( $debug && $words[$i] =~ m/\s/ ) {
	    print "warning: $paranum word " . (1 + $i) . " = [" . $words[$i] . "]\n";
	}
	for ( my $j = $maxphraselen; $j >= $minphraselen; $j-- ) {
	    print "working on $j word phrases\n"  if $debug >= 3;
	    # if there are at least $j words left in the para, add one instance for
	    # the $j-word phrase starting with the current word

	    my $words_left = $wordcount - $i;
	    if ( $words_left  >= $j ) {
		print "have $words_left words left so will count the $j word phrase starting with " . $words[ $i ] . "\n"  if $debug >= 3;

		my $phrase;
		for ( my $k = $i; $k <= $i + $j - 1; $k++ ) {
		    if ( $k > $i ) {
			$phrase .= " ";
		    }
		    $phrase .= $words[ $k ];
		}
		$phrasefreq[ $j - 1 ]{ $phrase }++;
	    }
	}
    } 
    if ( $debug >= 2 ) {
	print "\nanalyzed paragraph:\n" . $_ . "\nwith $wordcount words; and so far, frequencies are:\n";
	&print_frequencies;
    }
}

&print_frequencies;

print $total_wordcount . "\t" . "[TOTAL WORDS]";

##################################################

sub print_frequencies {

    for( my $m = ($minphraselen - 1); $m < $maxphraselen; $m++ ) {
	foreach ( keys %{ $phrasefreq[ $m ] } ) {
	    print $phrasefreq[ $m ]{ $_ } . "\t" . $_ . "\n";
	}
    }
}

sub display_usage {

    print STDERR <<HELP;

Usage: $0   [-w] [-p N] [-r M-N] [-h] [-d N] [--] <files...>

-w           only count individual words
-p N         count phrases of N words (1+, default 2)
-r M-N       count phrases of M to N words (M can be = 1)
-h           Get help (this message)
-d N         set debug trace level to N (0-3, default 1)
--           stop processing arguments and assume any remaining
             ones are filenames

<files...>   Optional input files to use as the corpus to analyze,
             or read standard input.

HELP

}
