
# Take two tab-delimited files, one with a lexicon with some
# word-shapes not assigned yet, and one with a set of new
# word-meanings to be assigned to words.  Assume both are sorted
# appropriately (i.e. lexicon sorted with shortest words first and
# meanings sorted with probably most common ones first).  Output
# lexicon with meanings added to words not yet meaning anything.
# Tricky bit: maintaining semantic category mapping.  This means we
# may have to read ahead in one file and then backtrack etc.


use strict;

my $debug = 1;

use constant PERCENT_GAPS => 10;

# Array indices
use constant WANT_ROOT => 2;
use constant CATEG_LEX => 3;
use constant CATEG_MEAN => 1;
use constant DEFINITION_MEAN => 3;
use constant DEFINITION_LEX => 4;
use constant DATE_LEX => 5;

my $lexicon_file = shift;
my $new_meanings_file = shift;
my $unused_meanings_file = $_ ? shift : "meanings-not-used-yet.txt";

open LEXICON, $lexicon_file or die;
open MEANINGS, $new_meanings_file or die;
open UNUSED, ">" . $unused_meanings_file;

srand;

my @lexicon;
my $i = 0;
while ( <LEXICON> ) {
    s/[\r\n]+$//;
    $lexicon[$i++] = [ split "\t" ];
}

$i = 0;
my @meanings;
while ( <MEANINGS> ) {
    s/[\r\n]+$//;
    $meanings[$i++] = [ split "\t" ];
}

# semantic categories of Conlang13
my @index_keys = qw( gp re su pr qu );

my %lexicon_indexes;

# Initialize each index to the first word with a blank def with the right category. 
foreach ( @index_keys ) {
    $lexicon_indexes{ $_ } = 0;
    &advance_counter( $_ );
#    for ( $i = 0; $lexicon[$i][DEFINITION_LEX] ne "" or $lexicon[$i][CATEG_LEX] ne $_; $i++ ) {
#    }
    if ( $debug ) {
	$i = $lexicon_indexes{ $_ };
	print STDERR "setting initial index for $_ to row $i where categ = " . $lexicon[$i][CATEG_LEX] 
	    . " and definition is \"" . $lexicon[$i][DEFINITION_LEX] . "\"\n";
    }

#    $lexicon_indexes{ $_ } = $i;
}

my @date = localtime;
my $datestr = sprintf( "%4d-%02d-%02d", $date[5] + 1900, $date[4] + 1, $date[3] );

for ( $i = 0; $i < scalar(@meanings); $i++ ) {
    if ( $meanings[$i][WANT_ROOT] ne "y" ) {
	print UNUSED join( "\t", @{ $meanings[$i] } ) . "\n";
	next;
    }

#    my $idxref = \( $lexicon_indexes{ $meanings[$i][CATEG_MEAN] } );
    my $idx = $lexicon_indexes{ $meanings[$i][CATEG_MEAN] };
#    print STDERR "index ref $idxref referring to $$idxref\n"   if $debug;
#    $lexicon[ $$idxref ][ DEFINITION_LEX ] = $meanings[$i][ DEFINITION_MEAN ];
#    $lexicon[ $$idxref ][ DATE_LEX ] = $datestr;
    $lexicon[ $idx ][ DEFINITION_LEX ] = $meanings[$i][ DEFINITION_MEAN ];
    $lexicon[ $idx ][ DATE_LEX ] = $datestr;
    &advance_counter( $meanings[$i][CATEG_MEAN] );
#    while ( $lexicon[ $$idxref ][DEFINITION_LEX] ne "" or $lexicon[ $$idxref ][CATEG_LEX] ne $meanings[$i][CATEG_MEAN]  ) {
#        ++$$idxref;
#    }
}

foreach ( @lexicon ) {
    print join( "\t", @{ $_ } ) . "\n";
}

##################################################

sub advance_counter {
    my $categ = shift;
    my $idxref =  \( $lexicon_indexes{ $categ } );
    while ( 1 ) {
	while ( $lexicon[ $$idxref ][DEFINITION_LEX] ne "" or $lexicon[ $$idxref ][CATEG_LEX] ne $categ  ) {
	    ++$$idxref;
	}
	last if ((rand 100) > PERCENT_GAPS);
	# skip this usable space, leave it to be filled later
	++$$idxref;
    }
}
