#!/usr/bin/perl

# Read a text file and make it a frameset HTML document set where every word is linked to its
# gloss in the sAb zjeda lexicon

# (c) 2008 Jim Henry.  Creative Commons licensing (not that it will
# be useful to anybody else as-is, but feel free to copy and modify)

use strict;

sub unicode_convert( $ );
sub anchor_convert( $ );
sub write_frameset( $$ );
sub make_hyperlink( $ );
sub fix_protected( $ );


# read command line args
my $infile;
my $outfile_glosslinks;
my $outfile_frameset;

# default arg value unless overridden with -l option
my $lexicon = "c13_lexicon-new.txt";

# default document title unless overriden by -t option
my $title = "gloss-links file generated by gen-glossed-doc-frameset.pl";
my $arg;

while ( $arg = shift ) {
    if ( $arg eq "-o" ) {  ## output filename
	my $outbase = shift;
	$outfile_frameset = $outbase . ".html";
	$outfile_glosslinks = $outbase . "_f.html";
    } elsif ( $arg eq "-l" ) { ## lexicon
	$lexicon = shift;
    } elsif ( $arg eq "-t" ) { ## document title - all of command line from here on becomes part of title
	$title = unicode_convert( join( " ", @ARGV ) );
    } else {
	if ( not defined $infile ) {
	    $infile = $arg;
	} else {
	    ###TODO write usage
	}
    }
}

# set output file names based on input file name or other args overriding it

if ( not defined $outfile_frameset ) {
    my $infile_base = $infile;
    $infile_base =~ s/\.txt$//;
    $outfile_frameset = $infile_base . ".html";
    $outfile_glosslinks = $infile_base . "_f.html";
}

# slurp in lexicon file, filtering out unwanted lines
open LEXICON, $lexicon;
my @lexicon_words;
while ( <LEXICON> ) {
    my @f = split /\t/;
    if ( $f[4] !~ /^ *$/ ) {
	push @lexicon_words, $f[0];
    }
}
close LEXICON;

###TODO add "or die"  at other file opens

open INFILE, $infile 				or die "can't open $infile for reading\n"; 
open GLOSSFRAME, "> $outfile_glosslinks"	or die "can't open $outfile_glosslinks for writing\n"; 


&write_frameset( $outfile_frameset, $outfile_glosslinks );

# write HTML header to text frame
print GLOSSFRAME<<HEADER;
<HTML>
<HEAD>
<meta name="generator" content="gen-glossed-doc-frameset.pl">
<TITLE>
$title
</TITLE>
<style><!--
  BODY { font-family: "Lucida Sans Unicode", sans-serif }
  A  { text-decoration: none }
--></style>
</HEAD>
</HTML>
<BODY>
<H1>
$title
</H1>

<p>
Click the hyperlinked words and morphemes to get their gloss in the lower frame.
(Links are not underlined here as underlining almost every word on the page is cluttery
and makes characters with descenders hard to distinguish.)

HEADER

my $in_para = 0;
my $p = 0;
my $outline = "";
my $linkcount = 0;
my $word;
my $line = 0;

while ( <INFILE> ) {
    $line++;
    if ( /^\s*$/ && $in_para ) {
	print GLOSSFRAME "</p>\n\n";
	$in_para = 0;
    }

    # if line is not blank
    if ( ! /^\s*$/ ) {

	if ( 0 == $in_para ) {
	    print  GLOSSFRAME  "<p id=\"p" . ++$p . "\">\n";
	    $in_para = 1;
	}
	
	my $origline = $_;  ##DEBUG
	$outline = "";
        # split line, saving delimiters and words in alternate result fields
	my @words = split(/([\s!?.{}\[\],;:~]+)/, $_ );

	# replace < > " & with HTML char entities -- where?

	for ( my $i = 0; $i < scalar(@words); $i++) {
	    ##print "line $line word $i: " . $words[$i] . "\n";   ##DEBUG

	    $word = $words[$i];
	    if ( grep { $word eq $_ } @lexicon_words ) {
                $word = &make_hyperlink( $word );
            } elsif ( $word =~ /'/ ) {
		##print "found compound not in lexicon: $word \n";
	        # if no match found and word contains '-'
		my @morphemes = split( /'/, $word );
		my $linkedword = "";
		my $j = 0;
		for ( $j = 0; $j < scalar(@morphemes); $j++ ) {
		    ##print "line $line word $i morpheme $j: " . $morphemes[$j] . "\n";   ##DEBUG
		    # check for match in nxcgtx array
		    if ( grep { $morphemes[$j] eq $_ } @lexicon_words ) {
			$linkedword .= &make_hyperlink( $morphemes[$j] );
		    } else {
			$linkedword .= $morphemes[$j];
		    }
		    if ( $j < (scalar(@morphemes) - 1) ) {
			$linkedword .= "'";
		    }
		} ## end for each morpheme in compound
		$word = $linkedword;
            }
            $outline .= $word;
	} ## end for each word on nonblank line

	$outline = fix_protected( unicode_convert( $outline ) );

        print GLOSSFRAME $outline . "\n";

    } # end if line is not blank
} # end for each line of input file

# write HTML footer to text frame

my $datestring = localtime();
print GLOSSFRAME<<FOOTER;
<HR />

<p>
<a href="intro.htm" target="_top">Main Conlang #13 index</a>
<br />
Last updated on $datestring
</p>

</body>
</html>

FOOTER

# close files
close;

# write done message
print "done processing $infile\n";

# ===

sub make_hyperlink( $ ) {
    $linkcount++;
    my $word = shift;
    my $anchor = $word;
    $anchor =~ s/'//g;
    $anchor = anchor_convert( $anchor );
    my $link = qq(<a href="c13_lexicon.html#$anchor" target="lexicon">$word</a>);
    return $link;
}

#================

sub write_frameset( $$ ) {
    my $framesetfile = shift;
    my $topframefile = shift;
    my $bottomframefile = "c13_lexicon.html";

    open FRAMESET, "> $framesetfile";
    print FRAMESET<<END;
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN"
   "http://www.w3.org/TR/html4/frameset.dtd">
<HTML>
<HEAD>
<meta name="generator" content="gen-glossed-doc-frameset.pl">
<title>
$title
</title>
</HEAD>

<FRAMESET rows="80%, 20%">
      <FRAME src="$topframefile"    name="text">
      <FRAME src="$bottomframefile" name="lexicon">
</FRAMESET>
<NOFRAMES>
      <P>Since your browser doesn't support frames, click here:
      <br />
      <a href="$topframefile">$topframefile</a>
      </P>
</NOFRAMES>
</FRAMESET>
</HTML>

END

    close FRAMESET;
}


#====================

sub unicode_convert( $ ) {
    my $text = shift;
    $text =~ s/Z/zh/g;
    $text =~ s/S/sh/g;
    $text =~ s/G/gh/g;
    $text =~ s/N/&#x14B;/g;
    $text =~ s/A/&#xE4;/g;
    return $text;
}

sub anchor_convert( $ ) {
    my $text = shift;
    $text =~ s/Z/zh/g;
    $text =~ s/S/sh/g;
    $text =~ s/G/gh/g;
    $text =~ s/N/__MM__/g;
    $text =~ s/A/__EE__/g;
    return $text;
}


sub fix_protected( $ ) {
    my $text = shift;
    $text =~ s/__EE__/A/g;
    $text =~ s/__MM__/N/g;
    return $text;
}
