clojens · November 22, 2014 02:14 · Nov 22, 2014
diff --git a/nl2pho2.pl b/nl2pho2.pl
@@ -0,0 +1,486 @@
+#!/usr/bin/perl -w
+use English; # things like MATCH
+
+# --------------------------------------------------------------
+
+# txt2phoNL - usage: perl txt2phoNL <dutch-text.txt >phonemes.pho
+# the generated phoneme file is suitable for use with MBROLA,
+# but you have to use the -e option in MBROLA to skip over
+# spurious unpronounceable phoneme pairs (e.g. caused by English
+# words in your Dutch text file!).
+# Hint: Use pipes, e.g. "ls | txt2phoNL | mbrola -e - - | play"
+
+# This is GPLed software (open source freeware) by
+# Eric Auer <[email protected]>, the license is the GNU GPL
+# version 2 or later, also available as copying.txt in this
+# directory, http://www.coli.uni-saarland.de/~eric/stuff/soft (3/2002)
+
+# Please give me some feedback: As I am no native speaker
+# of Dutch, this txt2phoNL definitely need some improvement!
+
+# --------------------------------------------------------------
+
+# new version 14 feb 2002:
+# - sanitize away illegal phone pairs in a last step,
+#   includes devoicing of consonants before a break.
+# - intermediate repn uses one char per phoneme.
+# - simpler rewrite mechanism eats all matched chars
+#   and produces only phones - so the text string is constant.
+#   BUT: restart from " " if the rule input ended in " " !
+# - steps: 1. digit/... names
+#          2. sound pattern rules (preferring long matches,
+#             walking the string and trying all rules per char)
+#          3. sanitize and get final repn from intermediate one
+
+# new version 2003-04-05 by Marc Spoorendonk [email protected] (native Dutch speaker)
+# - changed to much to mention. Very acceptable translation now.
+
+my $XLATEDEBUG = 4; # show all translation rule applications
+                    # of at least this size
+
+# special one char repn:
+# _ is " ", Ei is 1, 9y is 3, Au is 4, ai is 5,
+# oi is  6, ui is 7, Ai is 8, Oi is 9, . is EOF, ? is question
+# , is comma
+
+open(STRING,">/dev/stderr") || die "cannot open debug log\n";
+# open(STRING,">nl2pho.log") || die "cannot open debug log\n";
+my $foo;
+
+$OUTPUT_AUTOFLUSH = 1; # (also known as $|): flush after every
+                       # write/print, do not buffer output
+$/ = undef;       # do not split on line breaks
+                  # $/ is $RS, record separator in use English
+my $text0 = <STDIN>;   # read stdin
+my $text = " ";  # other stage (start with a space)
+my $phones = " "; # phoneme one-char-per-phoneme repn
+
+# by the way: a "^>*" remover would be nice for mails...
+
+
+# g vs G vs x: regen [reG@n] goal [goL]  gage [xaZe]
+# where the G (voiced "ch") is a dialect alternative to x ("ch")
+# and the g only occurs in foreign words.
+
+# e vs E vs @: gemak [x@mAk] gage [xaZe] veer [ver]  pet [pEt]
+# this is the len: e is long, is ee or e-at-end-of-syll.
+# E is short, is default, kind of.
+
+
+# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+# first step: reduce the alphabet by spelling out specials
+# result: a plain [a-z.? ]* string
+
+my %special = ("0","null",   "=","is",
+               "1","een",    "!","!",
+               "2","twee",   '"',"aanhaalingsteken",
+               "3","drie",
+               "4","vier",   "\$","dollar",
+               "5","vijf",   "%","procent",
+               "6","zes",    "&","en",
+               "7","zeven",  "/","slesh", #phonetically
+               "8","acht",   "(","haakje openen",
+               "9","negen",  ")","haakje sluiten,",
+               "*","ster",   "\\","beckslesh", #phonetically
+               "+","plus",   "?","?",
+               "#","hekje",  "|","paip", #phonetically
+               ".",".",      "_","underscoor", #phonetically
+               ",",",",      "-","",
+               ">","groter", ";",";",
+               "<","kleiner",":",":",
+               "^","dakje",  "@","aapestaartje",
+               "°","grad",   "{","accolade openen",
+               "[","hoekje", "]","hoekje sluiten,",
+               "~","tilde",  "}","accolade sluiten,"
+              );
+
+# use this: punt. koma, vraagteken?
+#  or that: .     ,     ?
+# the latter has the problem that a . or , or ?
+# surrounded by spaces just sounds like a space...
+
+# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+#Marc> Prefix with space for easyer matching.
+$text0 =~ s/^/ /g; 
+$text0 =~ s/$/ /g; 
+
+$text0 =~ s/^[>]*//g; # un-mailify the text :-)
+$text0 =~ s{://}{dubbele punt slesh slesh}g; # http:// and similar stuff
+$text0 =~ s{:-[)]}{,lachend gezicht}g;    # smiley
+$text0 =~ s{:[)]}{,lachend gezicht}g;    # smiley
+$text0 =~ s{:-[(]}{,treurig gezicht}g;    # smiley
+$text0 =~ s{:[(]}{,treurig gezicht}g;    # smiley
+$text0 =~ s{;-[)]}{,knipoogend gezicht}g; # smiley
+$text0 =~ s{;[)]}{,knipoogend gezicht}g; # smiley
+
+$text0 =~ s/cie/sie/g; # precies -> presies, provincie -> provinsie
+
+#Marc>  betaal -> betaal
+#Marc>  betalen -> betaalen
+#Marc> It keeps metten, marren, matten as they are.
+#Marc> betaling -> betaaling 
+#                b      e           t             a          l             i ng bet a a li ng
+$text0 =~ s/([^eaiou][eaou][rtpsdfgklzcvbnm])([eaiou])([rtpsdfgklzcvbnm][eaiou])/$1$2$2$3/g;
+
+#Marc>  meten -> meeten    maren -> maaren
+#                m       e             t           e        n     m e e ten
+$text0 =~ s/([^eaiou])([eaou])([rtpsdfgklzcvbnm][eaiou][^eaiou])/$1$2$2$3/g;
+
+#Marc> k.n.m.i. -> k n m i
+$text0 =~ s/([^a-z])([a-z])\./$1$2 /g;
+$text0 =~ s/([^a-z])([a-z])\./$1$2 /g;
+
+#Marc> remove lines from input: "-----------------------------" -> "" 
+$text0 =~ s/[-_=+]{3,}//g;
+
+#Marc> www.bla.com -> www punt bla punt com
+$text0 =~ s/\.([^ \n\t])/punt $1/g;
+
+#Marc> translate some numbers. (write a function for this once)
+$text0 =~ s/([^0-9])10([^0-9])/$1tien$2/g;
+$text0 =~ s/([^0-9])11([^0-9])/$1elf$2/g;
+$text0 =~ s/([^0-9])12([^0-9])/$1twaalf$2/g;
+$text0 =~ s/([^0-9])13([^0-9])/$1dertien$2/g;
+$text0 =~ s/([^0-9])14([^0-9])/$1veertien$2/g;
+$text0 =~ s/([^0-9])15([^0-9])/$1vijftien$2/g;
+$text0 =~ s/([^0-9])16([^0-9])/$1zestien$2/g;
+$text0 =~ s/([^0-9])17([^0-9])/$1zeventien$2/g;
+$text0 =~ s/([^0-9])18([^0-9])/$1achttien$2/g;
+$text0 =~ s/([^0-9])19([^0-9])/$1negentien$2/g;
+$text0 =~ s/([^0-9])20([^0-9])/$1twintig$2/g;
+$text0 =~ s/([^0-9])21([^0-9])/$1eenentwintig$2/g;
+$text0 =~ s/([^0-9])22([^0-9])/$1tweeentwintig$2/g;
+$text0 =~ s/([^0-9])23([^0-9])/$1drieentwintig$2/g;
+$text0 =~ s/([^0-9])24([^0-9])/$1vierentwintig$2/g;
+$text0 =~ s/([^0-9])25([^0-9])/$1vijfentwintig$2/g;
+$text0 =~ s/([^0-9])26([^0-9])/$1zesentwintig$2/g;
+$text0 =~ s/([^0-9])27([^0-9])/$1zevenentwintig$2/g;
+$text0 =~ s/([^0-9])28([^0-9])/$1achtentwintig$2/g;
+$text0 =~ s/([^0-9])29([^0-9])/$1negenentwintig$2/g;
+$text0 =~ s/([^0-9])30([^0-9])/$1dertig$2/g;
+$text0 =~ s/([^0-9])31([^0-9])/$1eenendertig$2/g;
+$text0 =~ s/([^0-9])32([^0-9])/$1tweeendertig$2/g;
+$text0 =~ s/([^0-9])33([^0-9])/$1drieendertig$2/g;
+$text0 =~ s/([^0-9])34([^0-9])/$1vierendertig$2/g;
+$text0 =~ s/([^0-9])35([^0-9])/$1vijfendertig$2/g;
+$text0 =~ s/([^0-9])36([^0-9])/$1zesendertig$2/g;
+$text0 =~ s/([^0-9])37([^0-9])/$1zevenendertig$2/g;
+$text0 =~ s/([^0-9])38([^0-9])/$1achtendertig$2/g;
+$text0 =~ s/([^0-9])39([^0-9])/$1negenendertig$2/g;
+
+print STDERR "Text: $text0";
+
+
+for my $char (split(//,$text0)) {
+  $char = lc($char);
+  $char = "eu" if ($char =~ /öÖ/); # approximately :-)
+  $char = "ae" if ($char =~ /äÄ/); # could be better
+  $char = "uu" if ($char =~ /Üü/); # should also be for &euml;
+  if (defined $special{$char}) {
+    $text .= " " unless ($text =~ / $/);
+    $text .= $special{$char} . " ";
+  } elsif ($char =~ /[a-z]/) {
+    $text .= $char;
+  } else {
+    $text .= " " unless ($text =~ / $/);
+  }   # simplify all whitespace/linebreak stretches
+      # and other special chars to single spaces
+}
+$text .= " " x 5; # end with spaces!
+
+# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+# second step: apply phoneme pattern rules (prefer longest
+# match, eat up all left side apart from trailing space,
+# produce pure phoneme list)
+
+my %five  = (	" lijk"," l1k", 
+		"lijk ",'l@k ',
+             	"elijk","El1k",
+		" bent", " bEnt",
+		"atie ","atsi",		#informatie
+             	"+size+",5
+            );
+
+my %four  = (	"http","ha te te pe ",
+             	"html","ha te Em El ",
+		"agen","axEn",
+             	"ooie","oi@",	# mooie
+		"ooit","oIt",
+		" er "," Er ",
+		" en "," En ",
+		" nl "," EnEl ",
+		" he "," hE ",
+		" ok "," oke ",
+             	"hou ","h4w ",
+		"ouch","uS",	# douche
+		"oush","uS",	# kianoush
+		" pc "," pe se ",
+             	"even",'ev@n',
+		"tie ","tsi",	#vakantie
+		" chi"," Si",	# china
+             	"+size+",4
+            );
+
+my %three = (	
+
+		"aai","5" ,
+		"ooi","oi" ,
+		"oei","7",
+		"cee","se",
+             	"ai ","8" ,
+		"oi ","9",
+		"age","aZe",
+             	"ch ","x" ,
+		"ftp","ef te pe ",
+             	"www","we we we ",
+             	"htm","ha te em ",
+             	"tp:", "te pe ",
+		"mp ", "Em pe ", # mp3
+		"mb ", "Embe ", # mp3
+             	"eeu", "e2",
+		"en ",'@n',
+		"he ","he",
+		"eij","1",
+#pronounciation of E before double dissonant
+		"ett","Et", #letter
+		"epp","Ep",
+		"ett","Et",
+		"err","Er",
+		"ekk","Ek", #lekker
+		"emm","Em",
+		"ess","Es",
+		"eff","Ef",
+		"ell","El",
+		"ebb","Eb",
+		"enn","En",
+#Marc> distinct letters:  k.n.m.i   a.u.b.
+		" a ", "a",
+		" b ", "be",
+		" c ", "se",
+		" d ", "de",
+		" e ", "e",
+		" f ", "Ef",
+		" g ", "xe",
+		" h ", "ha",
+		" i ", "i",
+		" j ", "ie",
+		" k ", "ka",
+		" l ", "El",
+		" m ", "Em",
+		" n ", "En",
+		" o ", "o",
+		" p ", "pe",
+		" q ", "ky",
+		" r ", "Er",
+		" s ", "Es",
+		" t ", "te",
+		" u ", "y",
+		" v ", "ve",
+		" w ", "we",
+		" x ", "Iks",
+		" y ", "1",
+		" z ", "zEt",
+             	"+size+",3
+            ); 
+
+my %two   = ("ie","i" , "oe","u" , "uu","y",
+             "aa","a" , "ee","e" , "oo","o",
+             "eu","2" ,            "ei","1",
+             "ui","3" , "ou","4" , "ij","1",
+             "sj","S" , "g ","x" , "nj","J",
+	     "ce","sE",
+             "l ","l" , "ng","N" ,
+             "dt","t" , "ch","x" , "iu","ju",
+             "dl",'d@l',           "lf",'l@f',
+             "bb","b" , "dd","d" , "e ",'@',
+             "d ","t" , "hr","r" , "hl","l",
+             "o ","o" , "a ", "a",
+	     "yl","1l",
+             "zl","z l",
+	     "mm", "m", # Marc> m-m is not a sound. Same for p-p and n-n.
+	     "pp", "p",
+	     "nn", "n",
+	     "rr", "r",
+	     "kk", "k",
+	     "tt", "t",
+             "+size+",2
+            ); # hr/hl/yl/zl: sane processing
+               # of foreign words
+
+my %one   = ("a","A", "b","b", "c","k",
+             "d","d", "e",'E', "f","f",
+             "g","x", "h","h", "i","I",
+             "j","j", "k","k", "l","l",
+             "m","m", "n","n", "o","O",
+             "p","p", "q","k", "r","r",
+             "s","s", "t","t", "u","Y",
+             "v","v", "w","w", "x","ks",
+             "y","j", "z","z", " "," ",
+             ".",".", "?","?", ",",",",
+             "+size+",1
+            ); # prosody with [ ?.,] is a later step
+
+my @todo = (\%five, \%four, \%three, \%two, \%one);
+
+# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+$phones = " ";
+my $x = 0;     # string index
+while ($x < (length($text)-5)) {
+  my $y = 0;
+  for my $hashref (@todo) {    # do l-longest rules first...
+    if ($y != 0) { next; }
+    my $check = substr($text,$x,$hashref->{"+size+"});
+    if (defined $hashref->{$check}) {
+      $phones .= $hashref->{$check};
+      $x += $hashref->{"+size+"};
+      $x-- if (($check =~ / $/) && ($check ne " "));
+      # skip over matched part, but rewind on " " suffix
+      $y++;
+      print STDERR "Translate: <$check> to /"
+        .   $hashref->{$check} . "/\n"
+        if  (length($check) >= $XLATEDEBUG);
+    }
+  }
+  if ($y == 0) {
+    print STDERR "Had to translate first char to NIL:\n";
+    print STDERR "<" . substr($text,$x,10) . "...>\n";
+    $phones .= " ";
+    $x++;
+  }
+}
+
+# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+# third step: convert to SAMPA alphabet and apply constraints
+# on phoneme pairings (input: $phones string)
+
+my %xlate = ("1","Ei", "3","9y", "4","Au", "5","ai",
+             "6","oi", "7","ui", "8","Ai", "9","Oi",
+             " ","_",  "?","_",  ".","_",  ",","_"
+            );
+
+my $Pvowel  = "aeiouAEIOy2Y13456789";
+my $Pdipht  = "56789";
+my $Pvoiced = "bdcvzZGhJjg"; # adding g for convenience
+my $Pconson = "ptkbdgcfvszSZxGhmn";
+my $Pvoice2 = "czZGhJj";
+my $Psemi   = "GNJL";
+my $Pspace  = ".?,_ ";
+
+# rules:
+
+# handled above: no "EY" or "IY" (replace by eY and iY)
+# handled above: no d before l (add schwa)
+# handled above: common case of bb and dd (replace by b and d)
+# handled above: commod case of d_ (devoice to t_)
+
+# no voiced/semi doubled (replace by single occurance)
+# no schwa   before OR AFTER dipht (remove schwa)
+# no voice2  before l, r or j (add schwa)
+# no voiced  before conson (add schwa ; duplication rule first)
+# no conson  before semi (add schwa)
+# special case of next rule: j-E (replace by j-@)
+# no dipht before or after vowel/j (insert " ", see above)
+# no voiced  at the end of a word (devoice)
+
+# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+$text = "";
+my $adder = ""; # buffer before we really add the phone!
+my $freq = 200; # freq in Hz, only used at " " for now
+my $dur = 100;  # duration in msec
+my $ph;         # current phoneme
+my $ph0 = " ";  # previous phoneme
+# the prosody and rhythm are still extremely simple
+
+foreach $ph (split(//,$phones)) {
+
+  if (($ph =~ /[${Psemi}${Pvoiced}]/) && ($ph0 eq $ph)) {
+    print STDERR "${ph}-$ph: removeone $ph\n";
+    $adder = ""; # ignore first copy of double phoneme
+  } elsif (($ph0 eq "@") && ($ph =~ /[${Pdipht}]/)) {
+    # remove the previous schwa
+    # (or just insert a short "h")
+    $adder = "";
+    print STDERR "\@-$ph: remove \@\n";
+  } elsif (($ph eq "@") && ($ph0 =~ /[${Pdipht}]/)) {
+    # remove the current schwa
+    $ph = "";
+    print STDERR "${ph0}-\@: remove \@\n";
+  } elsif (  (($ph0 =~ /[${Pvoice2}]/) && ($ph =~ /lrj/))
+          || (($ph0 =~ /[${Pvoiced}]/) && 
+              ($ph  =~ /[${Pconson}]/))
+          || (($ph0 =~ /[${Pconson}]/) &&
+              ($ph  =~ /[${Psemi}]/))
+          ) {
+    $adder .= "\@ 50\n";
+    print STDERR "${ph0}-$ph: insert schwa\n";
+  } elsif (($ph0 eq "j") && ($ph eq "E")) {
+    print STDERR "j-E: changeto j-\@\n";
+    $ph = "@"; # modify this part this time...
+  } elsif (  (($ph0 =~ /[${Pdipht}j]/) && 
+              ($ph  =~ /[${Pvowel}j]/))
+          || (($ph0 =~ /[${Pvowel}j]/) && 
+              ($ph  =~ /[${Pdipht}j]/))
+          ) {
+    if ($ph0 eq "j") {
+      $adder = "i 100\n";
+      print STDERR "${ph0}-$ph: changeto i-$ph\n";
+    }
+    if ($ph eq "j") {
+      print STDERR "${ph0}-$ph: changeto ${ph0}-i\n";
+      $ph = "i";
+    }
+    if (($ph0 ne "j") && ($ph ne "j")) {
+      $adder .= "_ 50\n";
+      print STDERR "${ph0}-$ph: insert break\n";
+    }
+  } elsif (($ph0 =~ /[${Pvoiced}]/) && ($ph =~ /[${Pspace}]/)) {
+    my $de = $ph0;
+    $de =~ tr/bdcvzZGhJjg/ptxfsSx_IIk/;
+          #  ptxfsSx IIk
+    print STDERR "${ph0}-_: changeto ${de}-_\n";
+    $adder = "$de 100\n";
+  }
+
+  if (($ph0 eq "j") && ($ph =~ /[${Pspace}]/)) {
+    print STDERR "j-_: insert \@\n";
+    $adder .= "\@ 100\n";
+  }
+
+  if ($adder) {
+    $text .= $adder; # add possibly corrected recent phoneme
+    $adder =~ s{^([^ ]*).*$}
+               {$1}gm;   # reduce to phonemes, multiline
+    die "<$adder> ?\n" if ($adder =~ / /);
+    $adder = join("-",split(/\n/,$adder)); # a\nb\n -> a-b-
+    print STRING "${adder}-";
+  }
+
+  if ($ph) {
+    $dur = ($ph =~ /[iuyaeo213456789rmnNJ]/) ? 200 : 100;
+                                  # longer for long vowels/rmnNJ
+    $freq = 200 if ($ph eq " ");  # default freq
+    $freq = 252 if ($ph eq "?");  # go up for questions
+    $freq = 159 if ($ph eq ".");  # go down for boundaries
+    $freq = 178 if ($ph eq ",");  # go down a bit for commas
+    if ($ph =~ /[${Pspace}]/) {   # various breaks
+      $adder = "_ 100 (50 , $freq)\n";
+    } else {
+      $adder = ( (defined $xlate{$ph}) ? $xlate{$ph} : $ph );
+                                  # use 1..2 char phone names
+      $adder .= " $dur\n";
+    }
+  } else {
+    print STDERR "Skip\n";
+    $adder = "";
+  }
+
+  $ph0 = $ph;
+
+}
+
+print "$text\n";
+
+print STRING "\n";
+close STRING;