#/bin/perl 
 
#eval '$'.$1.'$2;' while $ARGV[0] =~ /^([A-Za-z_0-9]+=)(.*)/ && shift; 
			# process any FOO=bar switches 
 

#$, = ' ';		# set output field separator 
#$\ = "\n";		# set output record separator 


use strict;

use constant ALPHABETICAL => 1;
use constant CATEGORICAL => 2;

sub unicode_convert( $ );
sub make_hyperlink( $ );
sub linkify( $ );
sub make_anchor( $ );
sub fix_protected( $ );

my $arg;
my $mode = ALPHABETICAL;

while ( $arg = shift ) {
    if ( $arg =~ m/^-c$/ ) {
	$mode = CATEGORICAL;
    } else {
	print STDERR "Unknown command line option $arg, ignoring it\n";
    }
}

my $entry_count = 0;

my $Word;
my $CV_type;
my $length;
my $gloss;
my $categ;
my $date_added;
my $roots;
my $derived;

my $anchor;

my $title = "Lexicon (alphabetical)";

if ( $mode == CATEGORICAL ) {
    $title = "Lexicon (categorical)";
}

print<<HEADER;
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN"
       "http://www.w3.org/TR/REC-html40/strict.dtd">
<html>
<head>
<title>
s&#xE4;b zjed'a: $title
</title>
<meta name = "keywords" "content = "lexicon constructed language">
<meta name = "generator" content="gen-lexicon-c13.pl">
<style> BODY { font-family: "Lucida Sans Unicode", sans-serif } </style>
</head>
<body>
<h1>
Conlang #13, phase 1: 
<a href="intro.htm">s&#xE4;b zjed'a</a> <br>
$title
</h1>

<DL>
HEADER


my @rows;
my @lexicon_entries;

 
while (<>) { 
    #chomp;	# strip record separator 
    s/[\r\n]+$//;

###    ( $Word, $RootType, $gloss, $categ, $date_added, $etym ) = split("\t", $_, 9999); 
    my @fields = split( "\t", $_ );
    $gloss = $fields[4];

    if ( $gloss !~ /^ *$/ ) { 
	push @rows, \@fields;
	push @lexicon_entries, $fields[0];
	$entry_count++; 
	if ( $fields[0] =~ /[ ']/ ) {
		$derived++;
	} else {
		$roots++;
	}
    }
}


my $line = 0;
# sort array and print output for each line of it

if ( $mode == ALPHABETICAL ) {
	@rows = sort { 
		my @p = @{$a};
		my @q = @{$b}; 
		return $p[0] cmp $q[0]; 
	} @rows;
} elsif ( $mode == CATEGORICAL ) {
	@rows = sort { 
		my @p = @{$a}; 
		my @q = @{$b}; 
		return ( $p[3] cmp $q[3] ) || ( $p[0] cmp $q[0] );
	} @rows;
} else {
	die "bad value for \$mode == $mode";
}


my $last_categ = "";
foreach ( @rows ) {
	( $Word, $CV_type, $length, $categ, $gloss, $date_added ) = @{ $_ };
	if ( $categ ne $last_categ && $mode == CATEGORICAL ) {
		my $catheader = unicode_convert( linkify( $categ ) );
		$anchor = make_anchor( $categ );
		my $line = qq(</DL>\n<h2 id="$anchor">\n) . $catheader . qq(\n</h2>\n<DL>\n);
		$line =~ s/__//g;
		print $line;
		$last_categ = $categ;
	}



	my $output_item = "";
	$anchor = make_anchor( $Word ); 

	# TODO: factor out similar code in linkify() and call that function here
	# instead of duplicating it (can't simply call linkify() here or it will
	# link each compound word to itself)
        if ( $Word =~ /[' ]/ ) {
	    # split on morphemes, link each morpheme to another entry
	    my @morphemes = split( /([' ])/, $Word );
	    my $linkedword = "";
	    my $j = 0;
	    for ( $j = 0; $j < scalar(@morphemes); $j++ ) {
		if ( grep { $_ eq @morphemes[$j] } @lexicon_entries ) {
		    $linkedword .= make_hyperlink( $morphemes[$j] );
		} else {
		    $linkedword .= $morphemes[$j];
		}
	    }
	    $Word = $linkedword;
        }

	$output_item .= fix_protected( "<DT id=\"" . $anchor . "\">" .  unicode_convert( $Word ) . "\n" ); 

	if ( $gloss =~ m/\{.+\}/ ) {
	    $gloss =~ s/(\{[^}]+\})/linkify($1)/ge;
	}
	$output_item .= fix_protected( '<DD>' . unicode_convert( $gloss ) . "\n" );

	print $output_item;

} # end for each line in lexicon 
 
print "</DL>\n"; 
 
print '<hr>'; 
 
print "<table cellspacing=5>\n"; 
print '<tr><td> Total root morphemes: <td> ' . $roots . "\n"; 
print '<tr><td> Total derived words: <td> ' . $derived . "\n"; 
print '<tr><td> Total entries: <td> ' . $entry_count . "\n"; 
print "</table>\n"; 
 
 
print '<p>Last modified '; 
print scalar( localtime() ); 
print "\n</body></html>\n"; 

### END



sub make_hyperlink( $ ) {
    my $linkword = shift;
    my $anchortext = $linkword;
    $anchortext =~ s/N/__MM__/g;
    $anchortext =~ s/A/__EE__/g;
    $anchortext =~ s/'//g;
    my $link = qq(<a href="#$anchortext">$linkword</a>);
    return $link;
}

sub linkify ( $ ) {
    my $arg = shift;
    my $linkified_text = "";
    # split line, saving delimiters and words in alternate result fields
    my @words = split(/([\s!?.{}\[\],;:~]+)/, $arg );
    for ( my $i = 0; $i < scalar(@words); $i++) {
	##print "line $line word $i: " . $words[$i] . "\n";   ##DEBUG
	
	
	if ( grep {  $words[$i]  eq $_ } @lexicon_entries ) {
	     $words[$i]  = &make_hyperlink(  $words[$i]  );
	} elsif (  $words[$i]  =~ /'/ ) {
	    #print STDERR "found compound not in lexicon:  $words[$i]  \n";
	    my @morphemes = split( /(')/,  $words[$i]  );
	    my $linkedword = "";
	    my $j = 0;
	    for ( $j = 0; $j < scalar(@morphemes); $j++ ) {
		##print "line $line word $i morpheme $j: " . $morphemes[$j] . "\n";   ##DEBUG
		# check for match in nxcgtx array
		if ( grep { $morphemes[$j] eq $_ } @lexicon_entries ) {
		    $linkedword .= &make_hyperlink( $morphemes[$j] );
		} else {
		    $linkedword .= $morphemes[$j];
		}
		    if ( $j < (scalar(@morphemes) - 1) ) {
			$linkedword .= "'";
		    }
	    } ## end for each morpheme in compound
	    $words[$i]  = $linkedword;
	}
	$linkified_text .=  $words[$i];
    } ## end for each word on nonblank line

    return $linkified_text;
}

sub make_anchor( $ ) {
   	my $anchortext = shift; 
	my $s = "'";
	$anchortext =~ s/$s//g; 
 	$anchortext =~ s/ /___/g; 
	$anchortext =~ s/Z/zh/g;
	$anchortext =~ s/S/sh/g;
	$anchortext =~ s/G/gh/g;
	return $anchortext;
}

sub unicode_convert( $ ) {
    my $text = shift;
    $text =~ s/Z/zh/g;
    $text =~ s/S/sh/g;
    $text =~ s/G/gh/g;
    $text =~ s/N/&#x14B;/g;
    $text =~ s/A/&#xE4;/g;
    return $text;
}

sub fix_protected( $ ) {
    my $text = shift;
    $text =~ s/__EE__/A/g;
    $text =~ s/__MM__/N/g;
    return $text;
}
