
# Convert gjax-zym-byn ASCII-transcription text from the input file into
# HTML numeric character references (Unicode code points) on output.  Some
# of the Unicode mappings are not really ideal.

# (c) 2004-2005 Jim Henry.  Creative Commons licensing (not that it will
# be useful to anybody else as-is, but feel free to copy and modify)



# Build the gsub() replacement text for one HTML numeric character
# reference.  "code" is the part between "&#" and ";" -- a decimal number,
# or "x" followed by hex digits.  The result starts with a backslash so
# that gsub() emits a literal "&" rather than the matched text.
function html_ref(code) {
  return "\\&#" code ";";
}

# Fill litero[]: each key is a two-letter ASCII digraph (a base letter
# followed by "x" or "q"), each value is the replacement text that the
# per-line rule hands to gsub().
BEGIN {
  litero["ax"] = html_ref(226);
  litero["cq"] = html_ref(269);
  litero["cx"] = html_ref(265);

  # Small eth.  Rejected alternative: 273, d with crossbar, which is not
  # very distinctive in Lucida Sans Unicode.
  litero["dx"] = html_ref(240);
  litero["eq"] = html_ref(283);

  # Capital F with extra crossbar (the franc symbol).
  litero["fx"] = html_ref(8355);
  litero["gx"] = html_ref(285);
  # Planck's constant sign.
  litero["hq"] = html_ref(295);
  litero["hx"] = html_ref(293);

  # i-breve; the author found no i-caron available.
  # NOTE(review): Unicode does define i-caron as U+01D0 (&#464;), so this
  # presumably refers to font coverage -- confirm before changing.
  litero["iq"] = html_ref(301);
  litero["ix"] = html_ref(238);

  # j with crossed tail (IPA for this sound).  Rejected alternative: 308,
  # capital J-circumflex -- not good.
  litero["jq"] = html_ref(669);
  litero["jx"] = html_ref(309);
  # Borrowed from Cyrillic.
  litero["kq"] = html_ref(1036);
  # k-cedilla.
  litero["kx"] = html_ref(311);
  # LATIN CAPITAL LETTER L WITH STROKE.
  litero["lq"] = html_ref(321);

  # m with hook (is to m as eng is to n); the IPA letter for the voiced
  # labiodental nasal.
  litero["mq"] = html_ref(625);

  litero["nq"] = html_ref(328);
  litero["nx"] = html_ref(331);

  litero["ox"] = html_ref(244);
  litero["oq"] = html_ref(466);

  # Capital Phi.
  litero["px"] = html_ref(934);
  # Lowercase p-hook.  Rejected alternative: 664, IPA bilabial click.
  litero["pq"] = html_ref(421);

  # r-acute (hex reference).
  litero["rx"] = html_ref("x155");
  # r-hacek.
  litero["rq"] = html_ref(345);

  # s-hacek.
  litero["sq"] = html_ref(353);
  # s-circumflex.
  litero["sx"] = html_ref(349);

  # t-apostrophe.
  litero["tq"] = html_ref(357);
  # Lowercase theta (hex reference).  Rejected alternatives: 8364, the
  # euro sign; 359, lowercase t with extra cross bar.
  litero["tx"] = html_ref("x3B8");

  # u-breve: not used in gzb, but some Esperanto text is included in gzb
  # files.
  litero["ux"] = html_ref(365);

  # Lowercase y-hook.  Rejected alternative: 1509, something from Hebrew.
  litero["vx"] = html_ref(436);

  # z-hacek.
  litero["zq"] = html_ref(382);
  # z-acute.
  litero["zx"] = html_ref(378);
}

{
  # Runs for every input record, in three passes over $0:
  #   1. hide English letter sequences that would otherwise be mistaken
  #      for transliteration digraphs,
  #   2. replace every digraph listed in litero[],
  #   3. restore the hidden English sequences,
  # and then prints the converted record.

  # Pass 1: masking.  "qu" is protected before the vowels e, a, i and o.
  gsub( /que/, "_QU_e", $0 );
  gsub( /qua/, "_QU_a", $0 );
  gsub( /qui/, "_QU_i", $0 );
  gsub( /quo/, "_QU_o", $0 );

  # "ix" sequences: suffix, ...
  gsub( /ffix/, "__FFX__", $0 );
  gsub( /ixe/, "__I_X__e", $0 );
  gsub( /ixi/, "__I_X__i", $0 );
  gsub( /mix/, "m__I_X__", $0 );

  # "ax" sequences: axis, axial, ... and syntax
  gsub( /axi/, "__A_X__i", $0 );
  gsub( /syntax/, "synt__A_X__", $0 );

  # "txt", the ASCII file extension
  gsub( /txt/, "_T_X_T_", $0 );

  # "ox" sequences: pobox.com (the author's PURL provider), oxen,
  # oxidize, oxygen, ...
  gsub( /pobox/, "pob__O_X__", $0 );
  gsub( /oxe/, "__O_X__e", $0 );
  gsub( /oxi/, "__O_X__i", $0 );
  gsub( /oxy/, "__O_X__y", $0 );

  # "ux" sequences: auxiliary, luxury
  gsub( /uxi/, "__U_X__i", $0 );
  gsub( /uxu/, "__U_X__u", $0 );

  # "nx" sequences: anxiety, anxious, ...
  gsub( /anxi/, "a__N_X__i", $0 );

  # TODO: skip this conversion for quoted text inside tags such as
  # <a href="" >.  That is tricky because such tags may span lines; doing
  # it properly may require converting the script to Perl.

  # Pass 2: transliterate every digraph in the table.
  for ( digraph in litero )
    gsub( digraph, litero[digraph], $0 );

  # Pass 3: unmasking.  Each placeholder goes back to the letters it hid.
  gsub( /_QU_/, "qu", $0 );
  gsub( /__FFX__/, "ffix", $0 );
  gsub( /__I_X__/, "ix", $0 );
  gsub( /__A_X__/, "ax", $0 );
  gsub( /__O_X__/, "ox", $0 );
  gsub( /__U_X__/, "ux", $0 );
  gsub( /_T_X_T_/, "txt", $0 );
  gsub( /__N_X__/, "nx", $0 );

  print $0;
}
