#!/usr/bin/perl

# Parse words read from standard input (or from files named on the
# command line) in Kamakawi phonology into syllables, then print the
# frequency of each syllable form overall and in initial, medial and
# final positions.

# (c) 2008 Jim Henry III.  Creative commons licensing.
# http://www.pobox.com/~jimhenry/conlang.htm

use strict;
use warnings;

# Syllable tallies keyed by syllable text.  The input loop below fills
# them in; the report code at the end of the script reads them.
my %allsylls;       # every syllable, whatever its position
my %initsylls;      # first syllable of each word
my %medialsylls;    # syllables that are neither first nor last
my %finalsylls;     # last syllable of each word

# fold_accents( $text )
#
# Return a copy of $text in which accented vowels are replaced by the
# plain lowercase vowels a, e, i, o, u.  Works on raw bytes so that both
# Latin-1 and UTF-8 encoded input are handled without decoding.
#
# NOTE(review): the accented letters originally listed in this script
# were lost from the source, so the set folded here (the grave, acute,
# circumflex, tilde, diaeresis and ring forms present in Latin-1, in
# either case) is a reconstruction -- confirm against the real word list.
sub fold_accents {
    my ($text) = @_;

    # UTF-8 encodes U+00C0..U+00FF as 0xC3 followed by (code point - 0x40).
    # Collapse just the accented vowels to their single Latin-1 byte so
    # the tr/// statements below cover both encodings.
    $text =~ s/\xC3([\x80-\x85\x88-\x8F\x92-\x96\x99-\x9C\xA0-\xA5\xA8-\xAF\xB2-\xB6\xB9-\xBC])/chr( ord($1) + 0x40 )/ge;

    # A replacement list shorter than the search list is padded with its
    # last character, so each whole range maps onto one plain vowel.
    $text =~ tr/\xC0-\xC5\xE0-\xE5/a/;
    $text =~ tr/\xC8-\xCB\xE8-\xEB/e/;
    $text =~ tr/\xCC-\xCF\xEC-\xEF/i/;
    $text =~ tr/\xD2-\xD6\xF2-\xF6/o/;
    $text =~ tr/\xD9-\xDC\xF9-\xFC/u/;

    return $text;
}

# normalize_word( $line )
#
# Turn one raw input line into the phonemic form that gets syllabified:
# lowercased, line terminators removed, homophone subscripts and simple
# parenthesised annotations stripped, accented vowels folded, and the
# orthographic |'| and |v| rewritten as /h/ and /f/.
sub normalize_word {
    my ($line) = @_;

    my $word = lc $line;
    $word =~ s/[\r\n]+//g;

    # get rid of homophone subscripts (0 included, so "10" leaves nothing)
    $word =~ s/[0-9]+//g;

    # get rid of stuff in parens; the word is already lowercase here
    $word =~ s/\([-a-z]*\)//g;

    $word = fold_accents($word);

    # change |'| and |v| into /h/ and /f/
    $word =~ tr/'v/hf/;

    return $word;
}

# syllabify( $word )
#
# Split a normalized word into syllables and return them as a list.  A
# syllable is any run of consonants followed by one vowel; characters
# that fit neither class stay attached to whatever precedes them.
# Returns an empty list when $word holds nothing but whitespace, and
# never returns an empty-string syllable.
sub syllabify {
    my ($word) = @_;

    # insert a space before each syllable boundary ...
    $word =~ s/(['hfvptklywnm]*[aeiou])/ $1/g;

    # ... then split on whitespace; the special ' ' pattern ignores
    # leading whitespace and treats runs of whitespace as one separator.
    return split ' ', $word;
}

while ( my $line = <> ) {
    my $word = normalize_word($line);

    # skip prefixes and suffixes for now; tested on the cleaned word so
    # a trailing subscript or annotation cannot hide the hyphen
    next if $word =~ /^\s*-/ or $word =~ /-\s*$/;

    my @sylls = syllabify($word);
    next unless @sylls;

    $allsylls{$_}++ for @sylls;

    # a one-syllable word counts as both initial and final
    $initsylls{ $sylls[0] }++;
    $finalsylls{ $sylls[-1] }++;

    # everything strictly between the first and last syllable; the
    # slice is empty for words of one or two syllables
    $medialsylls{$_}++ for @sylls[ 1 .. $#sylls - 1 ];
}

# print_counts( $title, \%count_of )
#
# Print $title on its own line, then one "syllable<TAB>count" line per
# key of the referenced hash, most frequent first.  Syllables with equal
# counts are listed alphabetically so the output is the same on every
# run instead of following hash order.
sub print_counts {
    my ( $title, $count_of ) = @_;

    print $title . "\n";

    my @ranked =
        sort { $count_of->{$b} <=> $count_of->{$a} or $a cmp $b }
        keys %{$count_of};

    for my $syll (@ranked) {
        print $syll . "\t" . $count_of->{$syll} . "\n";
    }

    return;
}

# Rule printed between (not after) the four report sections.
my $section_rule = "\n=============\n\n";

print_counts( 'Counts of all syllables:', \%allsylls );

print $section_rule;
print_counts( 'Counts of initial syllables:', \%initsylls );

print $section_rule;
print_counts( 'Counts of medial syllables:', \%medialsylls );

print $section_rule;
print_counts( 'Counts of final syllables:', \%finalsylls );
