#!/usr/bin/perl -w

# (c) 2006 Jim Henry III.  Creative commons licensing.

use strict;

my $debug = 0;
#my $minphraselen = 1;
#my $maxphraselen = 3;

my $phraselen = 3;

#=pod
my $arg;

while ( $arg = shift ) {
    print "processing \$arg == $arg\n" if $debug;

    # stop processing arguments if find '--' or an argument that 
    # doesn't start with '-'; in the latter cast, first put the arg
    # back for processing by <ARGV>

    if ( $arg eq "--" ) {
	last;
    }
    if ( $arg =~ m/^[^-]/ ) {
	unshift @ARGV, $arg;
	last;
    }

    if ( $arg eq "-d" ) {
	$debug = shift;
	if ( $debug !~ m/[0-9]+/ ) { 	# bare -d with no number argument
	    unshift @ARGV, $debug;
	    $debug = 1;
	}
	print "setting \$debug to $debug\n";
	next;
    }
    if ( $arg eq "-w" ) {
	$phraselen = 1;
	next;
    }
    if ( $arg eq "-p" ) {
	$phraselen = shift;
	if ( $phraselen !~ m/[0-9]+/ ) {
	    print "defaulting \$phraselen to 2\n"  if $debug;
	    $phraselen = 2;
	    unshift @ARGV, $phraselen;
	}
	next;
    }

    # unknown argument, or -? or -h
    if ( $arg =~ m/^-/ ) {
	&display_usage;
	exit;
    }
}
#=cut

my @phrasefreq;
my %wordfreq;

$/ = '';  		# read in a whole paragraph at a time, not just one line
my @words;

my $paranum = 0;
my $total_wordcount = 0;

#while ( <> ) {
while ( <ARGV> ) {
    $paranum++;
    @words = split /[-\s.?!,:\'\"()\[\]]+/;
    my $wordcount = scalar(@words);
    $total_wordcount += $wordcount;

    for ( my $i = 0; $i < $wordcount; $i++ ) {
	if ( $words[$i] =~ m/^$/ ) {
	    # results from split when a paragraph starts with space or punctuation
	    $total_wordcount--;
	    next;
	}
	print "para $paranum word " . (1 + $i) . " = [" . $words[$i] . "]\n"  if $debug >= 3;
	if ( $debug && $words[$i] =~ m/\s/ ) {
	    print "warning: $paranum word " . (1 + $i) . " = [" . $words[$i] . "]\n";
	}
	for ( my $j = $phraselen; $j >= 1; $j-- ) {
	    print "working on $j word phrases\n"  if $debug >= 3;
	    # if there are at least $j words left in the para, add one instance for
	    # the $j-word phrase starting with the current word

	    my $words_left = $wordcount - $i;
	    if ( $words_left  >= $j ) {
		print "have $words_left words left so will count the $j word phrase starting with " . $words[ $i ] . "\n"  if $debug >= 3;

		my $phrase;
		for ( my $k = $i; $k <= $i + $j - 1; $k++ ) {
		    if ( $k > $i ) {
			$phrase .= " ";
		    }
		    $phrase .= $words[ $k ];
		}
		$phrasefreq[ $j - 1 ]{ $phrase }++;
	    }
	}
    } 
    if ( $debug >= 2 ) {
	print "\nanalyzed paragraph:\n" . $_ . "\nwith $wordcount words; and so far, frequencies are:\n";
	&print_frequencies;
    }
}

&print_frequencies;

print $total_wordcount . "\t" . "[TOTAL]";

##################################################

sub print_frequencies {

    for( my $m = 0; $m < $phraselen; $m++ ) {
	foreach ( keys %{ $phrasefreq[ $m ] } ) {
	    print $phrasefreq[ $m ]{ $_ } . "\t" . $_ . "\n";
	}
    }
}

sub display_usage {

    print<<HELP;

Usage: $0  [-d N] [-r] <files...>

-h           Get help (this message)
-d N         set debug trace level to N (0-3, default 1)
-w           only count individual words
-p N         count phrases of up to N words (1+, default 2)
--           stop processing arguments and assume any remaining
             ones are filenames

<files...>   Optional input files to use as the corpus to analyze,
             or read standard input.

HELP

}

# -r M-N       count phrases of M to N words (M can be = 1)

