Skip to content

Instantly share code, notes, and snippets.

@gruzovator
Created March 17, 2017 06:18
Show Gist options
  • Select an option

  • Save gruzovator/1f0dc242d69f44e277f14347339491d7 to your computer and use it in GitHub Desktop.

Select an option

Save gruzovator/1f0dc242d69f44e277f14347339491d7 to your computer and use it in GitHub Desktop.

Revisions

  1. gruzovator created this gist Mar 17, 2017.
    61 changes: 61 additions & 0 deletions translit.cpp
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,61 @@
    #include <stdexcept>
    #include <string>

    #include <boost/scoped_ptr.hpp>
    #include <boost/thread/tss.hpp>
    #include <unicode/translit.h>
    #include <unicode/unistr.h>

    class TranslitConverter
    {
    static const UnicodeString TRANSLITERATION_RULES;

    public:
    TranslitConverter()
    {
    UErrorCode status = U_ZERO_ERROR;
    UParseError parserError;

    m_transliterator.reset(Transliterator::createFromRules ("TranslitConverter",
    TRANSLITERATION_RULES,
    UTRANS_FORWARD, parserError, status));
    if(!m_transliterator || U_FAILURE (status))
    {
    throw std::runtime_error("Failed to create translit convereter");
    }
    }

    ~TranslitConverter() {}

    std::string translit(const std::string &utf8string)
    {
    UnicodeString data = icu::UnicodeString::fromUTF8(utf8string);
    m_transliterator->transliterate(data);
    std::string result;
    data.toUTF8String(result);
    return result;
    }

    private:
    boost::scoped_ptr<icu::Transliterator> m_transliterator;

    };

    const UnicodeString TranslitConverter::TRANSLITERATION_RULES(
    "::NFKD;"
    // fix for old icu lin
    "х > kh;"
    "Х > Kh;"
    "\\/ > \\-;"
    //
    "::Russian-Latin/BGN;"
    "::[:Nonspacing Mark:] Remove;"
    "::NFC;"
    "::lower;"
    "::[^a-z0-9[:separator:]-] Remove;"
    "[[:separator:]-]+ > \\-;"
    );

    std::string translit(const std::string &s)
    {
    static boost::thread_specific_ptr<TranslitConverter> converterPtr;
    if(!converterPtr.get())
    {
    converterPtr.reset(new TranslitConverter);
    }
    return converterPtr->translit(s);
    }