

# Filter out strings in input that are too similar to each other.  To
# be output, a string must differ by at least two characters from any
# string encountered so far.  So if e.g. |kak|, |kat|, |kek|, and |tak|
# all occur in the input, only the first one encountered will be output.


use strict;

# Constants and command-line tunables.  These are file-level lexicals; the
# main loop and the subs below read them directly.
my $TRUE = 1;
my $debug = 0;                 # -d N : diagnostics are printed when non-zero
my $min_allowed_diff = 2;      # -m N : minimum chardiff() to every kept string

my $forbid_prefix = 0;         # -np
my $forbid_suffix = 0;         # -ns
my $forbid_substring = 0;      # -ni
my $substring_check_on = 0;    # true once any of -np / -ns / -ni was seen
my $substring_check_only = 0;  # -o  : skip the character-difference check

# Parse the command line.
#   -d N   set the debug level
#   -m N   set the minimum allowed difference
#   -np    reject a string when it and a kept string are prefix of one another
#   -ns    same, for suffixes
#   -ni    same, for substrings anywhere
#   -o     only perform the substring checks
# The loop tests definedness rather than truth so that an argument such as
# "0" or "" does not end option processing early.
while ( defined( my $arg = shift @ARGV ) ) {
    print "processing \$arg == $arg\n" if $debug;

    if ( $arg eq "-d" ) {
        # Validate the value so that a following option (e.g. "-d -o") is
        # not silently swallowed as the debug level.
        my $level = shift @ARGV;
        die "$0: option -d requires a non-negative integer debug level\n"
            unless defined $level and $level =~ m/\A\d+\z/;
        $debug = $level;
        print "setting \$debug to $debug\n"  if $debug;
    } elsif ( $arg eq "-m" ) {
        # if this is set to 1 it works like "uniq" but doesn't require
        # input to be sorted
        my $value = shift @ARGV;
        die "$0: option -m requires a non-negative integer\n"
            unless defined $value and $value =~ m/\A\d+\z/;
        $min_allowed_diff = $value;
        print "setting \$min_allowed_diff to $min_allowed_diff\n"  if $debug;
    } elsif ( $arg =~ m/^-n([psi])$/ ) {
        # Copy the capture right away; $1 is only reliable until the next
        # successful match.
        my $which = $1;
        print "processing $arg with $which\n"   if $debug;
        if ( $which eq "p" ) {
            $forbid_prefix = $TRUE;
        } elsif ( $which eq "s" ) {
            $forbid_suffix = $TRUE;
        } else {
            $forbid_substring = $TRUE;
        }
        $substring_check_on = $TRUE;
        print " \$forbid_substring == $forbid_substring, \$forbid_prefix == $forbid_prefix,\n \$forbid_suffix == $forbid_suffix, \$substring_check_on == $substring_check_on\n"    if $debug;
    } elsif ( $arg eq "-o" ) {
        print "arg -o so will only check for substring matches\n"   if $debug;
        $substring_check_only = $TRUE;
    } else {
        # Unknown arguments are still ignored, but no longer silently.
        warn "$0: ignoring unrecognized argument '$arg'\n";
    }

    ###TODO: arg specifying an input file of pre-included strings,
    # all accepted by definition even if some of them are not different
    # enough from each other...

    ###TODO display_usage func to show args if any arg not recognized
}

# -o without any of -np / -ns / -ni defaults to the general substring check.
if ( $substring_check_only  and  ! $substring_check_on ) {
    $forbid_substring = $TRUE;
    $substring_check_on = $TRUE;
}

# Main filter loop.  Reads candidate strings from STDIN, one per line, and
# prints each string that is accepted; accepted strings are remembered in
# @good_strings and every later candidate is compared against all of them.
#
# A candidate is rejected when
#   * a substring check is enabled and check_substrings() reports a match
#     with any kept string, or
#   * (unless -o was given) chardiff() to any kept string is smaller than
#     $min_allowed_diff.
#
# The first input line has nothing to be compared against, so it falls
# straight through the (empty) comparison loop and is accepted and printed
# like any other accepted string.
my @good_strings;
GETLINE: while ( defined( my $input_line = <STDIN> ) ) {
    chomp $input_line;
    $input_line =~ s/\r//g;    # drop carriage returns, e.g. from CRLF input

    COMPARE: foreach my $kept ( @good_strings ) {
        if ( $substring_check_on and check_substrings( $kept, $input_line ) ) {
            print "forbidden substring, should skip this [$input_line]\n"   if $debug;
            next GETLINE;
        }

        # With -o only the substring relation matters.
        next COMPARE if $substring_check_only;

        my $thisdiff = chardiff( $kept, $input_line );
        print $kept . "\t" . $input_line . "\t" . $thisdiff . "\n"   if $debug;

        # One kept string that is too close is enough to reject the
        # candidate; there is no need to look at the remaining ones.
        next GETLINE if $thisdiff < $min_allowed_diff;
    }

    push @good_strings, $input_line;
    print $input_line . "\n";
}

#foreach $s ( @good_strings ) {
#    print $s . "\n";
#}


############################################################


# check_substrings( $first, $second )
#
# Reports whether the shorter of the two strings is contained in the longer
# one in a way that the file-level flags forbid.  The arguments may be given
# in either order; on equal length the first one is treated as the longer.
#
#   $forbid_substring : match anywhere in the longer string
#   $forbid_prefix    : match at the start of the longer string
#   $forbid_suffix    : match at the end of the longer string
#
# Returns $TRUE on a forbidden match and an empty/undef value otherwise.
# An empty string counts as a prefix, suffix and substring of every string.
sub check_substrings {
    my ($first, $second) = @_;

    my ($longer, $shorter) = length( $first ) >= length( $second )
        ? ( $first,  $second )
        : ( $second, $first );

    print "&check_substrings( $longer, $shorter )\n"    if $debug >= 2;

    # The general substring check subsumes the prefix and suffix checks, so
    # its result is final either way.
    if ( $forbid_substring ) {
        return $TRUE if index( $longer, $shorter ) >= 0;
        return;
    }

    # The prefix and suffix checks are independent of each other: try the
    # second one when the first one does not match.
    if ( $forbid_prefix ) {
        return $TRUE
            if substr( $longer, 0, length( $shorter ) ) eq $shorter;
    }
    if ( $forbid_suffix ) {
        # Compare the actual tail of the longer string.  Locating the first
        # occurrence with index() is not enough: "ab" first occurs at
        # position 0 of "abab" although it is also its suffix.
        return $TRUE
            if substr( $longer, length( $longer ) - length( $shorter ) ) eq $shorter;
    }

    return;
}

# chardiff( $first, $second )
#
# Heuristic estimate of how many characters two strings differ by.  The
# arguments may be given in either order; on equal length the first one is
# treated as the longer.  Two candidates are computed and the smaller one is
# returned:
#
#   1. the position-by-position difference with both strings aligned at
#      their first character (parallel_diff of longer vs. shorter);
#   2. an "aligned" difference: the longest substring of the shorter string
#      that occurs in the longer one is located (longest first, leftmost
#      start in the shorter string first), the two strings are compared
#      position by position from that point on, and the number of skipped
#      leading characters of the longer string plus the difference in
#      remaining tail lengths is added.  When the strings share no character
#      at all, this candidate is the length of the longer string.
#
# This is not a true edit distance; it is the measure the filter is tuned to.
sub chardiff {
    my ($first, $second) = @_;

    my ($longer, $shorter) = length( $first ) >= length( $second )
        ? ( $first,  $second )
        : ( $second, $first );

    my $try_pdiff = parallel_diff( $longer, $shorter );

    print "chardiff $longer, $shorter\n" if $debug;

    # Search for the longest piece of $shorter that occurs in $longer.
    # The outer loop has to stop at length 1: a zero-length substring is
    # found at index 0 of every string and would pass for a real match.
    my $offset;               # start of the matching piece within $shorter
    my $match_offset = -1;    # where that piece was found within $longer
    SUBSTRING: for ( my $substrlen = length( $shorter ); $substrlen >= 1; $substrlen-- ) {
        for ( $offset = 0; $offset <= length( $shorter ) - $substrlen; $offset++ ) {
            print "\$substrlen $substrlen, \$offset $offset\n" if $debug;
            $match_offset = index( $longer, substr( $shorter, $offset, $substrlen ) );
            if ( $match_offset >= 0 ) {
                print "\$match_offset = $match_offset\n" if $debug;
                last SUBSTRING;
            }
        }
    }

    my $calc_diff;
    if ( $match_offset < 0 ) {
        # no characters from the short string were found in the long one.
        print "branch 1\n" if $debug;
        $calc_diff = length( $longer );
    } else {
        print "branch 2\n" if $debug;

        # Compare both strings from the start of the common piece onwards.
        my $pdiff = parallel_diff( substr( $longer, $match_offset ), substr( $shorter, $offset ) );

        # Add the characters of $longer skipped before the match and the
        # difference between the two remaining tail lengths.  abs() is
        # needed because either tail may be the longer one, e.g. for
        # "serak" ~ "ragak", where "ra" is found at offset 2 of "serak" and
        # "rak" is then compared against "ragak".  Earlier variants of this
        # formula mishandled "ABCD" ~ "RABCDE" or "muka" ~ "kuku".
        my $longer_tail  = length( $longer )  - $match_offset;
        my $shorter_tail = length( $shorter ) - $offset;
        $calc_diff = $pdiff + $match_offset + abs( $longer_tail - $shorter_tail );
    }
    print "\$calc_diff [$calc_diff] \$try_pdiff [$try_pdiff]\n"   if $debug;

    return $calc_diff > $try_pdiff ? $try_pdiff : $calc_diff;
}


# parallel_diff( $first, $second )
#
# Counts the positions at which the two strings carry different characters,
# walking over the positions of $first only.  A position of $first that lies
# beyond the end of $second counts as a difference; characters of $second
# beyond the end of $first are not looked at.
sub parallel_diff {
    my ($first, $second) = @_;

    my @left  = split //, $first;
    my @right = split //, $second;

    my $mismatches = 0;
    foreach my $pos ( 0 .. $#left ) {
        # Past the end of $second there is nothing to compare against;
        # treat that as an empty character, which never equals a real one.
        my $other = defined $right[$pos] ? $right[$pos] : '';
        $mismatches++ unless $left[$pos] eq $other;
    }

    print "parallel_diff( $first, $second ) == $mismatches\n" if $debug;
    return $mismatches;
}
