#!/usr/bin/perl

# wordgen.pl -- generate words for a conlang or
# names for gaming.
#
# Written by Benct Philip Jonsson <bpj@melroch.se> 2009.
# Last revised 10 jul 2009
# (Translated Swedish comments, added comments and POD).
#
# Partly based on everyword.pl written by John Cowan 2003
# and revised by Jim Henry 2006.
#
# Use or modify at will as long as you leave these lines in!
#
# This script takes as input rules in a file or on STDIN.
# See below the script for rule syntax.

use strict;
use warnings;
use utf8;
use open ':utf8';
use open ':std';
use Getopt::Long qw/:config bundling/;

# Option variables with default values
my $template = 'WORD';      # The template to use
my $number = 15;            # The number of words to generate
my $fails = 1000;           # allow how many failed attempts to generate a good word?
my $caps = 0;               # Should output words be capitalized?
my $sorted = 0;             # Should output be sorted?
my $debug_exclusions = 0;
my $debug_templates = 0;
my $debug_classes = 0;

exit if not GetOptions(
   "template|t=s"          =>  \$template,
   "number|words|n|w=i"    =>  \$number,
   "fails|fail-limit|f=i"  =>  \$fails,
   "caps|capitalize|c"     =>  \$caps,
   "sort-output|s"         =>  \$sorted,
   "debug-exclusions"      =>  \$debug_exclusions,
   "debug-templates"       =>  \$debug_templates,
   "debug-classes"         =>  \$debug_classes,
);

my $debug = 1 if $debug_exclusions;
$debug = 1 if $debug_templates;
$debug = 1 if $debug_classes;
$debug = 1 if $debug_exclusions;

if($debug){
   use Data::Dumper;
   $Data::Dumper::Deepcopy = 1;
   $Data::Dumper::Sortkeys = 1;
}


my %classesh;
my @exclusions;
my %templatesh;
my @template_tokens;

while(<>){
   chomp;
   my $line = $_;
   $line =~ s/\0//g;                   # Remove null characters if any
   $line =~ s/#.*//;                   # Remove comments
   next unless $line =~ /\S/;          # Skip empty lines
   $line =~ s/^\s+//;                  # Remove leading whitespace
   $line =~ s/\s+$//;                  # Remove trailing whitespace
   my @tokens = split /\s+/, $line;    # Split line into tokens
   my $name = $tokens[0];
   if($name =~ /^\w+:$/)               # It's a class
   {
       $name =~ s/:$//;                # Remove the colon
       shift @tokens;                  # Remove the name from @tokens
       # $classesh{$name} = [];        # Class definitions are not cumulative
       expand_class($name,\@tokens);
   }
   elsif($name eq '!')                 # It's a list of exclusion patterns
   {
       shift @tokens;                  # Throw away the '!'
       push @exclusions, @tokens;      # Add these exclusions to the list of exclusions
   }
   elsif($name =~ /^\w+=$/)            # It's a template
   {
       $name =~ s/=$//;                # Remove the equals
       shift @tokens;                  # Remove the name from @tokens
       @template_tokens = @tokens;
       $templatesh{$name} = expand_template();
   }
}

if($debug_templates){
       $Data::Dumper::Varname = "templates";
       print Dumper \%templatesh;
   }
if($debug_classes){
       $Data::Dumper::Varname = "classes";
       print Dumper \%classesh;
   }
if($debug_exclusions){
       $Data::Dumper::Varname = "exclusions";
       print Dumper \@exclusions;
   }

unless(exists $templatesh{$template}){
   die "There is no template $template!\n";
}

my %wordsh;


while(scalar keys %wordsh < $number and $fails)
# Keep generating words while there aren't many enough
# and we haven't failed to get a good word too many times.
{
   my $word = generate($templatesh{$template}); # Generate from the choosen template
   if($caps){ $word =~ s/(\p{L}+)/\u$1/g }      # Should we capitalize?
   # Throw away the word and decrement the fail counter
   # if the word contains an illegal pattern or if it already exists
   if(grep($word =~ /$_/i, @exclusions) or exists $wordsh{$word}){ $fails-- }
   else { $wordsh{$word} = 1 } # Save the word
}

my @words = keys %wordsh;
if($sorted){ @words = sort @words }

print join("\n", @words),"\n";

sub expand_class {
   my $name = $_[0];
   my @tokens = @{ $_[1] };
   my $item;
   foreach $item ( @tokens ) {
   if ( exists( $classesh{ $item } ) ) {
       expand_class( $name, $classesh{ $item } );
   } else {
       $item =~ s/\\_/ /g; # "\_" is converted into a whitespace
       $item =~ s/\\(.)/$1/g;  # Remove escaping backslashes
       push @{ $classesh{$name} }, $item;
   }
   }

}

sub expand_template {
   my $item;
   $item = [];     # Will hold a list of items
   while(@template_tokens){
       my $token = shift @template_tokens;
       if($token =~ /^ ( [\>\]\)] ) ( \{ (\d*)? (?: , (\d+) )? \} )? $/x)
       # If it is a closing bracket with or without a following quantifier.
       # I.e. "]" OR ")" OR ">" OR "]{1,2}" etc. OR "]{2}" etc. OR "]{,2}" etc.
       # Where
       # * $1 is the bracket closing an item
       # * $2 is the quantifier if any
       # * $3 is the minimum number of instances of the list
       # * $4 is the maximum number of instances of the list
       {
           # Make sure to return an empty string if $item has no items
           if(not @$item){ $item = '' }
           if($1 eq ']'){ $item = ["\0CHOICE", $item] } # It's a choice list
           # The null character is there to make sure flag differs from any
           # item "CHOICE" the user may supply.  That's why we stripped
           # any null characters from the input.  The word "CHOICE" is there to
           # make sure something is visible on the termidnal if we use --debug_templates.
           elsif($1 eq ')'){ $item = [ "\0CHOICE",[ '', $item ] ] } # It's an optional item
           # Internally an optional item is a choice list between an
           # empty string and another item.
           elsif($1 eq '>'){} # '>' is just a trigger to return $item
           if($2)  # If there is a quantifier
           {
               my $opt = $4 ? $4-$3 : 0;   # Calculate the number of optional instances
               # Return the correct number of obligatory and optional instances of $item
               return (($item) x $3,( [ "\0CHOICE",  [ '', $item ] ] ) x $opt);
           }
           else { return $item }   # There is no quantifier so just return $item as is
       }
       elsif($token =~ /^(?:\[|\(|\<)$/)   # It is any opening bracket!
       { push @$item, expand_template() }  # start a new list inside the current list
       elsif($token eq '\\_'){ push @$item, ' ' } # "\_" makes a whitespace
       elsif(exists $classesh{$token}){ push @$item, [ "\0CHOICE", [ @{ $classesh{$token} } ] ] }
       elsif(exists $templatesh{$token}){ push @$item, $templatesh{$token} }
       else {
           # Remove escaping backslashes
           # Any input backslash followed by anything other than whitespace
           # only makes sure whatever follows is interpreted literally
           # and is itself deleted
           $token =~ s/\\(.)/$1/g;
           push @$item, $token;    # Add $token to the current list/item
       }
   }
   return $item; # If there are no more items in @template_tokens
}


sub generate {
   my $template = shift;   # Get the template or sub-template list
   my $word;   # Holds the output
   if(ref $template eq 'ARRAY' and $$template[0] eq "\0CHOICE")
   # If it's a choice list
   {
       my @array = @{$$template[1]}; # Get the list of items to chose from
       # Pick a random item and process it recursively
       $word .= generate($array[rand @array]);
   }
   elsif(ref $template eq 'ARRAY') # If it's an ordinary "< a b c >" list
   # Go through each item, process it recursively and add the return value to the output
   { for my $item (@$template){ $word .= generate($item) } }
   # If the 'template is just a scalar value add it to the output
   if(not ref $template){ $word .= $template }
   return $word;   # Return the output
}

__END__



=pod

---------- Forwarded message ----------
From: Benct Philip Jonsson <bpj@melroch.se>
Date: Fri, Jul 10, 2009 at 11:28 AM
Subject: OFFLIST: My word generator
To: Jim Henry <jimhenry1973@gmail.com>


Hi!

I thought this may be of interest, especially the template syntax (which I also ultimately didn't come up with myself!)
I wrote it to roplace everyword.pl since I was troubled by
the enormous output volume the latter gave with anything but
very restricted word shape possibilities.

I send it offlist for fear of exposing my amateur code...

There is quite extensive POD describing the rule syntax
at the bottom.

P.S. I'll be offline Monday to Wednesday

/BP

=cut


=pod

=head1 NAME

B<wordgen.pl> - generates words for a conlang or names for gaming.

=head1 USAGE

 perl wordgen.pl [ OPTIONS ] rules-file

 OR

    perl wordgen.pl [ OPTIONS ]

    <rules>

    <Ctrl-D>

=head1 DESCRIPTION

The input to B<wordgen.pl> consists of rules for forming words typed
either in a file or on STDIN. The output is word shapes generated
according to the rules.

=head1 OPTIONS

=over

=item *

E<lt>--template | -tE<gt> E<lt>stringE<gt>

The template to use

=item *

E<lt>--number | --words | -n | -wE<gt> E<lt>integerE<gt>

The number of words to generate

=item *

E<lt>--fails | -fail-limit | -fE<gt> E<lt>integerE<gt>

Allow how many failed attempts to generate a good word?

(We don't want to get into an infinite loop of failed attempts!)

=item *

E<lt>--caps | -capitalize | -cE<gt>

Should output words be capitalized?

=item *

E<lt>--sort-output | -sE<gt>

Should output be sorted ASCIIbetically?

=item *

E<lt>--debug-exclusionsE<gt>

Dump the array holding the exclusions to the terminal.

=item *

E<lt>--debug-classesE<gt>

Dump the associative array holding the classes to the terminal

=item *

E<lt>--debug-templatesE<gt>

Dump the associative array holding the templates to the terminal.

=back

=head1 RULES

=head2 Kinds of rules

There are three kinds of rules:

=over

=item * B<Classes:> lists of graphemes and grapheme combinations which
may be used in template rules.

=item * B<Templates:> patterns describing how graphemes may be combined
into words.

=item * B<Exclusions:> regular expression patterns which must not match
on generated words.

=back

Rule files may also contain comments. Comments start with a # and
continue to the end of the line, like in Perl.

Each rule occupies one line

Empty lines (and lines with only whitespace) are ignored.

Rules can come in any order, but since earlier rules can be included in
later rules

=over

=item * Classes can be included in other classes

=item * Classes can be included in templates

=item * Templates can be included in other templates

=back

The relative order of rules matters. Generally you would define

=over

=item * classes included in other classes before the classes they are
included in.

=item * classes before templates.

=item * templates included in other templates before the templates they
are included in.

=back

=head2 Rule syntax

=head2 Classes

Class rules start with a I<classname> (any number of letters, digits or
underscores) followed by a colon C<:>. The class members are a list of
graphemes or grapheme combinations separated by whitespace:

 V: a e i o u # Vowels
 C: p t k b d g ph th th kh ch j s sh h m n ng r l y w # Consonants

A class may include previously defined classes; one may for example
have the following rules:

 P:  p t k               # Voiceless plosives
 B:  b d g               # Voiced plosives
 PH: ph th kh            #Aspirated plosives
 AF: ch j                # Affricates
 S:  s sh                # Sibilants
 N:  m n ng              # Nasals
 L:  r l                 # Liquids
 Y:  y w                 # Semivowels
 C:  P B PH AF S h N L Y # Consonants

Which would result in a class "C" identical to the one above. Note that
class names and ordinary graphemes can be mixed with eachother ("h"
above)!

This come in useful where the same classes of graphemes recur as
members of several larger classes:

 InitC:  P B PH AF S h N L Y # Word-initial consonants
 FinC:   P S N L             # Word-final consonants

Another way class inclusion may be used is for weighting. A rule like

 InitC: P P P B B PH AF AF S S h N N L Y     # Word-initial consonants

Will make members of class "P", trice as likely and members of the
classes "B", "AF", "S" and "N" twice as likely to occur as initial
consonant as members of the classes "PH", "L" and "Y" or the letter
"h".

(Of course you can do weighting with simple graphemes too; a vowel rule
like

 V: a a i i u u e o

will make any of "a i u" twice as likely to occur as any of "e o".)

If you specify several class rules with the same name the members of
each instance are added to those already defined. This I<is> useful if
you want to define a very large class but not want very long lines in
your file:

 C:  p t k               # Voiceless plosives
 C:  b d g               # Voiced plosives
 C:  ph th kh            #Aspirated plosives
 C:  ch j                # Affricates
 C:  s sh                # Sibilants
 C:  m n ng              # Nasals
 C:  r l                 # Liquids
 C:  y w                 # Semivowels

 # Same as

 C: p t k b d g ph th th kh ch j s sh h m n ng r l y w # Consonants

 # but you may comment out some subclass if needed!

If you want a space character as a member of a class type C<\_>
(backslash + underscore). A backslash followed by any other
non-whitespace character is interpreted as an escaping backslash: it
makes sure that whatever follows it is interpreted literally. Of course
a backslash can be escaped by another backslash, so C<\\_> gives a
literal C<\_>, should you want that. If you want any class member to
contain an actual backslash followed by some other character you have
to type a double backslash C<\\> .

 ------------------------------------------------------------
 The string      is interpreted as
 ----------      --------------------------------------------
 \               \

 \\              \

 \\              space character

 \\_             \_

 C               The class "C" if such a class exists, else
                the letter C.

 \C              The letter C, whether there is a class
                "C" or not.
 ------------------------------------------------------------

=head2 Templates

A template rule starts with a I<templatename> (any number of letters,
digits or underscores) followed by an equals sign C<=>. The template is
a list which may contain four kinds of elements:

=over

=item * literal grapheme strings.

=item * a class name.

=item * a I<subtemplate>, which recursively may contain the same kinds
of elements as a top-level template.

=back

=head2 Kinds of subtemplates

There are three kinds of subtemplates:

Type

Example

Output

Sequence

'abc'

Optional

( a b c )

'abc' or nothing

Choice

[ a b c ]

'a' or 'b' or 'c'

It is useful to think of certain patterns as shorthands for certain
other patterns (where C<E<lt> E<gt>> is an empty sequence, i.e.
I<null>; it outputs an empty string:

 -----------------------------------------------------------
 Pattern         is short for
 -----------------------------------------------------------
 ( a b c )       [ < > < a b c > ]

 ( [ a b c ] )   [ < > [ a b c ] ]

 ( < a b c > )   [ < > < < a b c > > ]

                 # In effect a version of ( a b c ) which
                 # takes longer to process, and, thus rather
                 # meaningless.

 [ < a b c > ]   < a b c >    but slower to process

 [ a ]           a
                # I.e. optionals are meaningless unless
                # they contain at least two immediate
                # subelements!

 C               [ p t k ]
                # if there is a class rule
                #    C: p t k
                # already defined!
 -----------------------------------------------------------

Obviously a I<sequence> is only interesting if it contains elements
which are themselves are I<optionals> or I<choices>, or if it is
followed by a I<quantifier> (for which see below):

 < ( s ) ( [ p t k ] ) >     # outputs any of
                            # '', 's', 'p', 't', 'k', 'sp', 'st', 'sk'

 [ < > s p t k < s [ p t k ] > ]

 # Outputs the same but probably with a more natural weighting,
 # since any of the singleton consonants or nothing is as likely
 # to be output an s + p/t/k .

Note that anything enclosed in C<( )> has only a 50% chance of being
output at all. Note the difference:

 ( [ p t k ] )   # Each of p, t, k has a 16.7% chance of appearing in the output.

 [ < > p t k ]   # Each of p, t k, has a 25% chance of appearing in the output.

Thus while C<( )> is useful it may be too powerful!

Note that a template element corresponding to a literal string not
enclosed in C<( )> or C<[ ]> will B<always> appear in the output!

 SYLL= C V s     # All syllables will end in 's'
                # -- unless there is a class "s"!

There is a convention that class or template names consist of or
contain capitals, and although it is useful in the general case where
output words are not to contain any capitals, but it is not enforced: a
class or template name may contain any characters matching your perl
versions definition of C</\w+/>!

=head2 Quantifiers

The bracket closing a subtemplate may be imediately (without any
intervening whitespace) followed by a quantifier similar to a Perl
regular expression quantifier:

 < x >{2,5}  =   < x x ( x ) ( x ) ( x ) >

 # at least 2 and at most 5 instances of 'x':
 # 'xx' OR 'xxx' OR 'xxxx' or 'xxxxx'

Again note that C<( )> is rather powerful:

 ( x ){1,2}  =   < [ < > < x > ] [ < > [ < > < x > ] ] >

 # The odds for a second x to actually appear is rather low!
 # Do you want to calculate the odds?

Obviously quantifiers are most useful after C<E<lt> E<gt>> or C<[ ]>
subtemplates for specifying the number of syllables in a word:

 < C V ( C ) >{1,3}          # One to three CV or CVC syllables.

 [ < C V > < C V C > ]{1,3}  # The same, but with more complicated
                            # subtemplates there would probably be
                            # a difference in weighting!

Note that an outer pair of C<E<lt> E<gt>> around a top-level template
are useless unless followed by a quantifier, since such an outer pair
is so to speak always inserted automatically!

=head2 Templates inside templates

A previously defined top-level template may be included as a
subtemplate in another template by specifying its name as an element of
the other template:

 SYLL=   C V ( C )
 WORD=   < SYLL >{0,3} C V # = < C V ( C ) >{0,3} C V

 ! ([aeiou])\1  (yi|wu)[aeiou]
 SYLL=   C ( [ i u ] ) V ( [ m n ng i u ] )
 NAME=   SYLL < SYLL >{2}    # A Chinese style name?

Obviously this is most useful if the included template is more
complicated!

=head2 Whitespace in templates

Whitespace in template definitions is significant. In particular there
I<must> be whitespace around brackets delimiting subtemplates, while
there must be no whitespace between a bracket closing a subtemplate and
a quantifier or inside a quantifier:

 This sequence of chars      outputs this
 --------------------------  ------------------------
 [ a b ]                     'a' or 'b'
 [a b]                       '[ab]' # sic!
 [ ab ]                      'ab'
 < a b >{1,2}                'ab' or 'abab'
 < a b > {1,2}               'ab{1,2}' # sic!
 [ a b]                      # Garbles your template!
 < a b>{1,2}                 # Garbles your template!
 < a b>{1, 2}                # Garbles your template!

 All kinds of brackets work the same way!

=head2 Exclusions

A list of exclusions is a line starting with an exclamation point,
followed by Perl regular expressions separated by whitespace:

 ! ^[mnysh][rl] ^[rl][^aeiouy]   # /^[mnysh][rl]/ and /^[rl][^aeiouy]/
 ! (.)\1                         # Nothing may be doubled

Note that many exclusions slow down the script significantly! Thus if
only certain combinations of characters are permitted at certain
positions in a word define classes containing those combinations as
members and use the classes in your template, rather than a more
general template and a lot of specific exclusions. Cf. the example file
below.

=head1 AUTHOR

Benct Philip Jonsson bpj at melroch dot se

Thanks are due to:

=over

=item * John Cowan who wrote everyword.pl and Jim Henry who modified
the same, since I borrowed the syntax and most of the code for classes
and exclusions from them.

=item * The poster to the CONLANG mailing list who wrote a generator in
PHP which used a similar template syntax.

=back

=head1 Example rules file

 # west.wgn -- a wordgen.pl rules file which attempts to generate
 # words and names similar to the Germanic component of English.
 #
 # It actually looks like someone not knowing Middle Cornish trying
 # to write something which looks like Middle Cornish...

 # Classes
 CI: b c d f g h j k l m n p r s t v w y ch th sh qu ph
 V1: a e i o u y
 V2: ai au aw ay ea ee ei ey ie oa ou ow # eo ew oi oy ui uy
 V: V1 V1 V1 V2 V2 eo ew
 E: y e ey le er s es ies
 NE: y e ey le er ing ling s es ies
 SC: sl sm sn sw squ spr str scr spl
 CR: br cr dr fr gr pr tr wr chr thr shr phr
 CL: bl cl fl gl pl ch
 TW: tw thw dw
 CF: b d ff g ck m n p r s t x ch th sh gh
 RC: rb rd rf rg rk rl rm rn rp rr rs rt rx rch rth rsh rsp rst rsk
 LC: lb ld lf lg lk ll lm ln lp ls lt lx lch lth lsh lst lsk
 NC: mb mp nd nt ng nk mn nx
 SP: sp st sk
 CT: ft ght xt

 # Exclusions
 ! phuck fuck shit cunt
 ! \b\S\b \b\S\S\b
 ! [^aeiouy]([^aeiouy])\1 eey?\b  ue eu
 ! \bi[aeouy] [aeiou]{3} [aeiou]{2}[yw] ([aiuyw])\1 [ao]u\b
 ! (x|ch|sh)s\b
 ! [^iuyw]gh [^aeo][uw]gh
 ! (^|[^c])k[aou] c[iey]
 ! [rln]x[^aeiouy]

 # Templates
 SYLL= ( [ CI CR CL SC TW ] ) V [ < > CF CF RC RC LC NC NC SP CT ]
 WORD= SYLL ( ( e ) SYLL ) (  E )
 NAME= SYLL ( ( e ) SYLL ) (  NE  )
 FULLNAME= < WORD \_ >{1,2} NAME

=cut
