Skip to content

Instantly share code, notes, and snippets.

@clojens
Created November 22, 2014 02:14
Show Gist options
  • Save clojens/8fc00c0357e06f0ec00f to your computer and use it in GitHub Desktop.
Save clojens/8fc00c0357e06f0ec00f to your computer and use it in GitHub Desktop.

Revisions

  1. clojens created this gist Nov 22, 2014.
    486 changes: 486 additions & 0 deletions nl2pho2.pl
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,486 @@
    #!/usr/bin/perl -w
    use English; # things like MATCH

    # --------------------------------------------------------------

    # txt2phoNL - usage: perl txt2phoNL <dutch-text.txt >phonemes.pho
    # the generated phoneme file is suitable for use with MBROLA,
    # but you have to use the -e option in MBROLA to skip over
    # spurious unpronounceable phoneme pairs (e.g. caused by English
    # words in your Dutch text file!).
    # Hint: Use pipes, e.g. "ls | txt2phoNL | mbrola -e - - | play"

    # This is GPLed software (open source freeware) by
    # Eric Auer <[email protected]>, the license is the GNU GPL
    # version 2 or later, also available as copying.txt in this
    # directory, http://www.coli.uni-saarland.de/~eric/stuff/soft (3/2002)

    # Please give me some feedback: As I am no native speaker
    # of Dutch, this txt2phoNL definitely need some improvement!

    # --------------------------------------------------------------

    # new version 14 feb 2002:
    # - sanitize away illegal phone pairs in a last step,
    # includes devoicing of consonants before a break.
    # - intermediate repn uses one char per phoneme.
    # - simpler rewrite mechanism eats all matched chars
    # and produces only phones - so the text string is constant.
    # BUT: restart from " " if the rule input ended in " " !
    # - steps: 1. digit/... names
    # 2. sound pattern rules (preferring long matches,
    # walking the string and trying all rules per char)
    # 3. sanitize and get final repn from intermediate one

    # new version 2003-04-05 by Marc Spoorendonk [email protected] (native Dutch speaker)
    # - changed to much to mention. Very acceptable translation now.

    my $XLATEDEBUG = 4; # show all translation rule applications
    # of at least this size

    # special one char repn:
    # _ is " ", Ei is 1, 9y is 3, Au is 4, ai is 5,
    # oi is 6, ui is 7, Ai is 8, Oi is 9, . is EOF, ? is question
    # , is comma

    open(STRING,">/dev/stderr") || die "cannot open debug log\n";
    # open(STRING,">nl2pho.log") || die "cannot open debug log\n";
    my $foo;

    $OUTPUT_AUTOFLUSH = 1; # (also known as $|): flush after every
    # write/print, do not buffer output
    $/ = undef; # do not split on line breaks
    # $/ is $RS, record separator in use English
    my $text0 = <STDIN>; # read stdin
    my $text = " "; # other stage (start with a space)
    my $phones = " "; # phoneme one-char-per-phoneme repn

    # by the way: a "^>*" remover would be nice for mails...


    # g vs G vs x: regen [reG@n] goal [goL] gage [xaZe]
    # where the G (voiced "ch") is a dialect alternative to x ("ch")
    # and the g only occurs in foreign words.

    # e vs E vs @: gemak [x@mAk] gage [xaZe] veer [ver] pet [pEt]
    # this is the len: e is long, is ee or e-at-end-of-syll.
    # E is short, is default, kind of.


    # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    # first step: reduce the alphabet by spelling out specials
    # result: a plain [a-z.? ]* string

    my %special = ("0","null", "=","is",
    "1","een", "!","!",
    "2","twee", '"',"aanhaalingsteken",
    "3","drie",
    "4","vier", "\$","dollar",
    "5","vijf", "%","procent",
    "6","zes", "&","en",
    "7","zeven", "/","slesh", #phonetically
    "8","acht", "(","haakje openen",
    "9","negen", ")","haakje sluiten,",
    "*","ster", "\\","beckslesh", #phonetically
    "+","plus", "?","?",
    "#","hekje", "|","paip", #phonetically
    ".",".", "_","underscoor", #phonetically
    ",",",", "-","",
    ">","groter", ";",";",
    "<","kleiner",":",":",
    "^","dakje", "@","aapestaartje",
    "°","grad", "{","accolade openen",
    "[","hoekje", "]","hoekje sluiten,",
    "~","tilde", "}","accolade sluiten,"
    );

    # use this: punt. koma, vraagteken?
    # or that: . , ?
    # the latter has the problem that a . or , or ?
    # surrounded by spaces just sounds like a space...

    # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    #Marc> Prefix with space for easyer matching.
    $text0 =~ s/^/ /g;
    $text0 =~ s/$/ /g;

    $text0 =~ s/^[>]*//g; # un-mailify the text :-)
    $text0 =~ s{://}{dubbele punt slesh slesh}g; # http:// and similar stuff
    $text0 =~ s{:-[)]}{,lachend gezicht}g; # smiley
    $text0 =~ s{:[)]}{,lachend gezicht}g; # smiley
    $text0 =~ s{:-[(]}{,treurig gezicht}g; # smiley
    $text0 =~ s{:[(]}{,treurig gezicht}g; # smiley
    $text0 =~ s{;-[)]}{,knipoogend gezicht}g; # smiley
    $text0 =~ s{;[)]}{,knipoogend gezicht}g; # smiley

    $text0 =~ s/cie/sie/g; # precies -> presies, provincie -> provinsie

    #Marc> betaal -> betaal
    #Marc> betalen -> betaalen
    #Marc> It keeps metten, marren, matten as they are.
    #Marc> betaling -> betaaling
    # b e t a l i ng bet a a li ng
    $text0 =~ s/([^eaiou][eaou][rtpsdfgklzcvbnm])([eaiou])([rtpsdfgklzcvbnm][eaiou])/$1$2$2$3/g;

    #Marc> meten -> meeten maren -> maaren
    # m e t e n m e e ten
    $text0 =~ s/([^eaiou])([eaou])([rtpsdfgklzcvbnm][eaiou][^eaiou])/$1$2$2$3/g;

    #Marc> k.n.m.i. -> k n m i
    $text0 =~ s/([^a-z])([a-z])\./$1$2 /g;
    $text0 =~ s/([^a-z])([a-z])\./$1$2 /g;

    #Marc> remove lines from input: "-----------------------------" -> ""
    $text0 =~ s/[-_=+]{3,}//g;

    #Marc> www.bla.com -> www punt bla punt com
    $text0 =~ s/\.([^ \n\t])/punt $1/g;

    #Marc> translate some numbers. (write a function for this once)
    $text0 =~ s/([^0-9])10([^0-9])/$1tien$2/g;
    $text0 =~ s/([^0-9])11([^0-9])/$1elf$2/g;
    $text0 =~ s/([^0-9])12([^0-9])/$1twaalf$2/g;
    $text0 =~ s/([^0-9])13([^0-9])/$1dertien$2/g;
    $text0 =~ s/([^0-9])14([^0-9])/$1veertien$2/g;
    $text0 =~ s/([^0-9])15([^0-9])/$1vijftien$2/g;
    $text0 =~ s/([^0-9])16([^0-9])/$1zestien$2/g;
    $text0 =~ s/([^0-9])17([^0-9])/$1zeventien$2/g;
    $text0 =~ s/([^0-9])18([^0-9])/$1achttien$2/g;
    $text0 =~ s/([^0-9])19([^0-9])/$1negentien$2/g;
    $text0 =~ s/([^0-9])20([^0-9])/$1twintig$2/g;
    $text0 =~ s/([^0-9])21([^0-9])/$1eenentwintig$2/g;
    $text0 =~ s/([^0-9])22([^0-9])/$1tweeentwintig$2/g;
    $text0 =~ s/([^0-9])23([^0-9])/$1drieentwintig$2/g;
    $text0 =~ s/([^0-9])24([^0-9])/$1vierentwintig$2/g;
    $text0 =~ s/([^0-9])25([^0-9])/$1vijfentwintig$2/g;
    $text0 =~ s/([^0-9])26([^0-9])/$1zesentwintig$2/g;
    $text0 =~ s/([^0-9])27([^0-9])/$1zevenentwintig$2/g;
    $text0 =~ s/([^0-9])28([^0-9])/$1achtentwintig$2/g;
    $text0 =~ s/([^0-9])29([^0-9])/$1negenentwintig$2/g;
    $text0 =~ s/([^0-9])30([^0-9])/$1dertig$2/g;
    $text0 =~ s/([^0-9])31([^0-9])/$1eenendertig$2/g;
    $text0 =~ s/([^0-9])32([^0-9])/$1tweeendertig$2/g;
    $text0 =~ s/([^0-9])33([^0-9])/$1drieendertig$2/g;
    $text0 =~ s/([^0-9])34([^0-9])/$1vierendertig$2/g;
    $text0 =~ s/([^0-9])35([^0-9])/$1vijfendertig$2/g;
    $text0 =~ s/([^0-9])36([^0-9])/$1zesendertig$2/g;
    $text0 =~ s/([^0-9])37([^0-9])/$1zevenendertig$2/g;
    $text0 =~ s/([^0-9])38([^0-9])/$1achtendertig$2/g;
    $text0 =~ s/([^0-9])39([^0-9])/$1negenendertig$2/g;

    print STDERR "Text: $text0";


    for my $char (split(//,$text0)) {
    $char = lc($char);
    $char = "eu" if ($char =~ /öÖ/); # approximately :-)
    $char = "ae" if ($char =~ /äÄ/); # could be better
    $char = "uu" if ($char =~ /Üü/); # should also be for &euml;
    if (defined $special{$char}) {
    $text .= " " unless ($text =~ / $/);
    $text .= $special{$char} . " ";
    } elsif ($char =~ /[a-z]/) {
    $text .= $char;
    } else {
    $text .= " " unless ($text =~ / $/);
    } # simplify all whitespace/linebreak stretches
    # and other special chars to single spaces
    }
    $text .= " " x 5; # end with spaces!

    # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    # second step: apply phoneme pattern rules (prefer longest
    # match, eat up all left side apart from trailing space,
    # produce pure phoneme list)

    my %five = ( " lijk"," l1k",
    "lijk ",'l@k ',
    "elijk","El1k",
    " bent", " bEnt",
    "atie ","atsi", #informatie
    "+size+",5
    );

    my %four = ( "http","ha te te pe ",
    "html","ha te Em El ",
    "agen","axEn",
    "ooie","oi@", # mooie
    "ooit","oIt",
    " er "," Er ",
    " en "," En ",
    " nl "," EnEl ",
    " he "," hE ",
    " ok "," oke ",
    "hou ","h4w ",
    "ouch","uS", # douche
    "oush","uS", # kianoush
    " pc "," pe se ",
    "even",'ev@n',
    "tie ","tsi", #vakantie
    " chi"," Si", # china
    "+size+",4
    );

    my %three = (

    "aai","5" ,
    "ooi","oi" ,
    "oei","7",
    "cee","se",
    "ai ","8" ,
    "oi ","9",
    "age","aZe",
    "ch ","x" ,
    "ftp","ef te pe ",
    "www","we we we ",
    "htm","ha te em ",
    "tp:", "te pe ",
    "mp ", "Em pe ", # mp3
    "mb ", "Embe ", # mp3
    "eeu", "e2",
    "en ",'@n',
    "he ","he",
    "eij","1",
    #pronounciation of E before double dissonant
    "ett","Et", #letter
    "epp","Ep",
    "ett","Et",
    "err","Er",
    "ekk","Ek", #lekker
    "emm","Em",
    "ess","Es",
    "eff","Ef",
    "ell","El",
    "ebb","Eb",
    "enn","En",
    #Marc> distinct letters: k.n.m.i a.u.b.
    " a ", "a",
    " b ", "be",
    " c ", "se",
    " d ", "de",
    " e ", "e",
    " f ", "Ef",
    " g ", "xe",
    " h ", "ha",
    " i ", "i",
    " j ", "ie",
    " k ", "ka",
    " l ", "El",
    " m ", "Em",
    " n ", "En",
    " o ", "o",
    " p ", "pe",
    " q ", "ky",
    " r ", "Er",
    " s ", "Es",
    " t ", "te",
    " u ", "y",
    " v ", "ve",
    " w ", "we",
    " x ", "Iks",
    " y ", "1",
    " z ", "zEt",
    "+size+",3
    );

    my %two = ("ie","i" , "oe","u" , "uu","y",
    "aa","a" , "ee","e" , "oo","o",
    "eu","2" , "ei","1",
    "ui","3" , "ou","4" , "ij","1",
    "sj","S" , "g ","x" , "nj","J",
    "ce","sE",
    "l ","l" , "ng","N" ,
    "dt","t" , "ch","x" , "iu","ju",
    "dl",'d@l', "lf",'l@f',
    "bb","b" , "dd","d" , "e ",'@',
    "d ","t" , "hr","r" , "hl","l",
    "o ","o" , "a ", "a",
    "yl","1l",
    "zl","z l",
    "mm", "m", # Marc> m-m is not a sound. Same for p-p and n-n.
    "pp", "p",
    "nn", "n",
    "rr", "r",
    "kk", "k",
    "tt", "t",
    "+size+",2
    ); # hr/hl/yl/zl: sane processing
    # of foreign words

    my %one = ("a","A", "b","b", "c","k",
    "d","d", "e",'E', "f","f",
    "g","x", "h","h", "i","I",
    "j","j", "k","k", "l","l",
    "m","m", "n","n", "o","O",
    "p","p", "q","k", "r","r",
    "s","s", "t","t", "u","Y",
    "v","v", "w","w", "x","ks",
    "y","j", "z","z", " "," ",
    ".",".", "?","?", ",",",",
    "+size+",1
    ); # prosody with [ ?.,] is a later step

    my @todo = (\%five, \%four, \%three, \%two, \%one);

    # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

    $phones = " ";
    my $x = 0; # string index
    while ($x < (length($text)-5)) {
    my $y = 0;
    for my $hashref (@todo) { # do l-longest rules first...
    if ($y != 0) { next; }
    my $check = substr($text,$x,$hashref->{"+size+"});
    if (defined $hashref->{$check}) {
    $phones .= $hashref->{$check};
    $x += $hashref->{"+size+"};
    $x-- if (($check =~ / $/) && ($check ne " "));
    # skip over matched part, but rewind on " " suffix
    $y++;
    print STDERR "Translate: <$check> to /"
    . $hashref->{$check} . "/\n"
    if (length($check) >= $XLATEDEBUG);
    }
    }
    if ($y == 0) {
    print STDERR "Had to translate first char to NIL:\n";
    print STDERR "<" . substr($text,$x,10) . "...>\n";
    $phones .= " ";
    $x++;
    }
    }

    # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    # third step: convert to SAMPA alphabet and apply constraints
    # on phoneme pairings (input: $phones string)

    my %xlate = ("1","Ei", "3","9y", "4","Au", "5","ai",
    "6","oi", "7","ui", "8","Ai", "9","Oi",
    " ","_", "?","_", ".","_", ",","_"
    );

    my $Pvowel = "aeiouAEIOy2Y13456789";
    my $Pdipht = "56789";
    my $Pvoiced = "bdcvzZGhJjg"; # adding g for convenience
    my $Pconson = "ptkbdgcfvszSZxGhmn";
    my $Pvoice2 = "czZGhJj";
    my $Psemi = "GNJL";
    my $Pspace = ".?,_ ";

    # rules:

    # handled above: no "EY" or "IY" (replace by eY and iY)
    # handled above: no d before l (add schwa)
    # handled above: common case of bb and dd (replace by b and d)
    # handled above: commod case of d_ (devoice to t_)

    # no voiced/semi doubled (replace by single occurance)
    # no schwa before OR AFTER dipht (remove schwa)
    # no voice2 before l, r or j (add schwa)
    # no voiced before conson (add schwa ; duplication rule first)
    # no conson before semi (add schwa)
    # special case of next rule: j-E (replace by j-@)
    # no dipht before or after vowel/j (insert " ", see above)
    # no voiced at the end of a word (devoice)

    # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

    $text = "";
    my $adder = ""; # buffer before we really add the phone!
    my $freq = 200; # freq in Hz, only used at " " for now
    my $dur = 100; # duration in msec
    my $ph; # current phoneme
    my $ph0 = " "; # previous phoneme
    # the prosody and rhythm are still extremely simple

    foreach $ph (split(//,$phones)) {

    if (($ph =~ /[${Psemi}${Pvoiced}]/) && ($ph0 eq $ph)) {
    print STDERR "${ph}-$ph: removeone $ph\n";
    $adder = ""; # ignore first copy of double phoneme
    } elsif (($ph0 eq "@") && ($ph =~ /[${Pdipht}]/)) {
    # remove the previous schwa
    # (or just insert a short "h")
    $adder = "";
    print STDERR "\@-$ph: remove \@\n";
    } elsif (($ph eq "@") && ($ph0 =~ /[${Pdipht}]/)) {
    # remove the current schwa
    $ph = "";
    print STDERR "${ph0}-\@: remove \@\n";
    } elsif ( (($ph0 =~ /[${Pvoice2}]/) && ($ph =~ /lrj/))
    || (($ph0 =~ /[${Pvoiced}]/) &&
    ($ph =~ /[${Pconson}]/))
    || (($ph0 =~ /[${Pconson}]/) &&
    ($ph =~ /[${Psemi}]/))
    ) {
    $adder .= "\@ 50\n";
    print STDERR "${ph0}-$ph: insert schwa\n";
    } elsif (($ph0 eq "j") && ($ph eq "E")) {
    print STDERR "j-E: changeto j-\@\n";
    $ph = "@"; # modify this part this time...
    } elsif ( (($ph0 =~ /[${Pdipht}j]/) &&
    ($ph =~ /[${Pvowel}j]/))
    || (($ph0 =~ /[${Pvowel}j]/) &&
    ($ph =~ /[${Pdipht}j]/))
    ) {
    if ($ph0 eq "j") {
    $adder = "i 100\n";
    print STDERR "${ph0}-$ph: changeto i-$ph\n";
    }
    if ($ph eq "j") {
    print STDERR "${ph0}-$ph: changeto ${ph0}-i\n";
    $ph = "i";
    }
    if (($ph0 ne "j") && ($ph ne "j")) {
    $adder .= "_ 50\n";
    print STDERR "${ph0}-$ph: insert break\n";
    }
    } elsif (($ph0 =~ /[${Pvoiced}]/) && ($ph =~ /[${Pspace}]/)) {
    my $de = $ph0;
    $de =~ tr/bdcvzZGhJjg/ptxfsSx_IIk/;
    # ptxfsSx IIk
    print STDERR "${ph0}-_: changeto ${de}-_\n";
    $adder = "$de 100\n";
    }

    if (($ph0 eq "j") && ($ph =~ /[${Pspace}]/)) {
    print STDERR "j-_: insert \@\n";
    $adder .= "\@ 100\n";
    }

    if ($adder) {
    $text .= $adder; # add possibly corrected recent phoneme
    $adder =~ s{^([^ ]*).*$}
    {$1}gm; # reduce to phonemes, multiline
    die "<$adder> ?\n" if ($adder =~ / /);
    $adder = join("-",split(/\n/,$adder)); # a\nb\n -> a-b-
    print STRING "${adder}-";
    }

    if ($ph) {
    $dur = ($ph =~ /[iuyaeo213456789rmnNJ]/) ? 200 : 100;
    # longer for long vowels/rmnNJ
    $freq = 200 if ($ph eq " "); # default freq
    $freq = 252 if ($ph eq "?"); # go up for questions
    $freq = 159 if ($ph eq "."); # go down for boundaries
    $freq = 178 if ($ph eq ","); # go down a bit for commas
    if ($ph =~ /[${Pspace}]/) { # various breaks
    $adder = "_ 100 (50 , $freq)\n";
    } else {
    $adder = ( (defined $xlate{$ph}) ? $xlate{$ph} : $ph );
    # use 1..2 char phone names
    $adder .= " $dur\n";
    }
    } else {
    print STDERR "Skip\n";
    $adder = "";
    }

    $ph0 = $ph;

    }

    print "$text\n";

    print STRING "\n";
    close STRING;