Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save Lysander6/d17ca0aa5ecb80b049334e03c467e572 to your computer and use it in GitHub Desktop.

Select an option

Save Lysander6/d17ca0aa5ecb80b049334e03c467e572 to your computer and use it in GitHub Desktop.

Revisions

  1. Lysander6 revised this gist May 8, 2020. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion polish_sentence_nltk_tokenizer.py
    Original file line number Diff line number Diff line change
    @@ -16,7 +16,7 @@

    lang_abbrev = ['jęz','fr','franc', 'ukr', 'ang', 'gr', 'hebr', 'czes', 'pol', 'niem', 'arab', 'egip', 'hiszp', 'jap', 'chin', 'kor', 'tyb', 'wiet', 'sum']

    military_abbrev = ['kpt', 'kpr', 'obs', 'pil', 'mjr','płk', 'dypl', 'pp', 'gw', 'dyw', 'bryg', 'ppłk', 'mar', 'marsz', 'rez', 'ppor', 'DPanc', 'BPanc', 'DKaw', 'p.uł']
    military_abbrev = ['kpt', 'kpr', 'obs', 'pil', 'mjr','płk', 'dypl', 'pp', 'gw', 'dyw', 'bryg', 'ppłk', 'mar', 'marsz', 'rez', 'ppor', 'DPanc', 'BPanc', 'DKaw', 'p.uł', 'nadkom']

    extra_abbreviations= extra_abbreviations + position_abbrev + quantity_abbrev + place_abbrev + actions_abbrev + place_abbrev + lang_abbrev+military_abbrev

  2. @ksopyla ksopyla revised this gist Apr 1, 2020. 1 changed file with 0 additions and 2 deletions.
    2 changes: 0 additions & 2 deletions polish_sentence_nltk_tokenizer.py
    Original file line number Diff line number Diff line change
    @@ -23,8 +23,6 @@
    sentence_tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')
    sentence_tokenizer._params.abbrev_types.update(extra_abbreviations)

    sentence_tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')
    sentence_tokenizer._params.abbrev_types.update(extra_abbreviations)

    text = '.....'

  3. @ksopyla ksopyla revised this gist Feb 27, 2020. 1 changed file with 8 additions and 11 deletions.
    19 changes: 8 additions & 11 deletions polish_sentence_nltk_tokenizer.py
    Original file line number Diff line number Diff line change
    @@ -4,24 +4,21 @@
    # nltk.download()
    nltk.download('punkt')

    extra_abbreviations = ['ps', 'inc', 'Corp', 'Ltd', 'Co', 'pkt', 'Dz.Ap', 'Jr', 'jr' 'sp', 'Sp', 'poj', 'pseud', 'krypt', 'sygn', 'Dz.U', 'ws', 'itd', 'np', 'sanskryt', 'nr', 'gł', 'Takht', 'tzw', 't.zw', 'ewan' ]
    extra_abbreviations = ['ps', 'inc', 'Corp', 'Ltd', 'Co', 'pkt', 'Dz.Ap', 'Jr', 'jr', 'sp', 'Sp', 'poj', 'pseud', 'krypt', 'sygn', 'Dz.U', 'ws', 'itd', 'np', 'sanskryt', 'nr', 'gł', 'Takht', 'tzw', 't.zw', 'ewan', 'tyt', 'oryg', 't.j', 'vs', 'l.mn', 'l.poj' ]

    position_abbrev = ['bp', 'dr', 'prof', 'zwycz', 'hab', 'B.Sc', 'Ph.D', 'lek', 'med', 'n.med' 'bł', 'św' ]
    position_abbrev = ['Ks', 'Abp', 'abp','bp','dr', 'kard', 'mgr', 'prof', 'zwycz', 'hab', 'arch', 'arch.kraj', 'B.Sc', 'Ph.D', 'lek', 'med', 'n.med', 'bł', 'św', 'hr', 'dziek' ]

    roman_abbrev= ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI', 'XII', 'XII','XIV','XV','XVI', 'XVII', 'XVIII','XIX', 'XX', 'XXI' ]
    quantity_abbrev = [ 'mln', 'obr./min','km/godz', 'godz', 'egz', 'ha', 'j.m', 'cal', 'obj', 'alk', 'wag' ] # not added: tys.

    quantity_abbrev = [ 'mln', 'tys', 'km/godz', 'obr./min' 'godz', 'egz']
    actions_abbrev = ['tłum','tlum','zob','wym', 'pot', 'ww', 'ogł', 'wyd', 'min', 'm.i', 'm.in', 'in', 'im','muz','tj', 'dot', 'wsp', 'właść', 'właśc', 'przedr', 'czyt', 'proj', 'dosł', 'hist', 'daw', 'zwł', 'zaw' ]

    actions_abbrev = ['tłum', 'zob','wym', 'pot', 'ww', 'ogł', 'tzn', 'wyd', 'min', 'm.i', 'm.in', 'm. in' 'im','muz','tj', 'dot', 'wsp', 'właść', 'przedr', 'czyt', 'proj', 'dosł' ]
    place_abbrev = ['Śl', 'płd', 'geogr']

    place_abbrev = ['Śl', 'płd']
    lang_abbrev = ['jęz','fr','franc', 'ukr', 'ang', 'gr', 'hebr', 'czes', 'pol', 'niem', 'arab', 'egip', 'hiszp', 'jap', 'chin', 'kor', 'tyb', 'wiet', 'sum']

    lang_abbrev = ['jęz', 'fr', 'ang', 'gr', 'hebr', 'czes', 'pol', 'niem', 'arab', 'egip', 'hiszp', 'jap', 'chin', 'kor', 'tyb', 'wiet']
    military_abbrev = ['kpt', 'kpr', 'obs', 'pil', 'mjr','płk', 'dypl', 'pp', 'gw', 'dyw', 'bryg', 'ppłk', 'mar', 'marsz', 'rez', 'ppor', 'DPanc', 'BPanc', 'DKaw', 'p.uł']

    military_abbrev = ['kpt', 'kpr', 'obs', 'pil', 'mjr', 'płk', 'dypl', 'pp', 'gw', 'dyw', 'bryg', 'ppłk', 'mar', 'marsz' 'rez', 'ppor', 'DPanc', 'BPanc', 'DKaw']

    # concat all lists
    extra_abbreviations= extra_abbreviations + position_abbrev + roman_abbrev + quantity_abbrev + place_abbrev + actions_abbrev + place_abbrev + lang_abbrev+military_abbrev
    extra_abbreviations= extra_abbreviations + position_abbrev + quantity_abbrev + place_abbrev + actions_abbrev + place_abbrev + lang_abbrev+military_abbrev

    sentence_tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')
    sentence_tokenizer._params.abbrev_types.update(extra_abbreviations)
  4. @ksopyla ksopyla revised this gist Feb 19, 2020. No changes.
  5. @ksopyla ksopyla revised this gist Feb 18, 2020. No changes.
  6. @ksopyla ksopyla revised this gist Feb 18, 2020. 1 changed file with 19 additions and 4 deletions.
    23 changes: 19 additions & 4 deletions polish_sentence_nltk_tokenizer.py
    Original file line number Diff line number Diff line change
    @@ -4,12 +4,27 @@
    # nltk.download()
    nltk.download('punkt')

    extra_abbreviations = ['ps', 'dr', 'prof', 'med' 'inc', 'Corp' 'pkt', 'pot', 'zob', 'm.i', 'Dz.Ap', 'Jr', 'tłum', 'sp', 'muz', 'ww', 'Śl', 'zob', 'poj', 'm.in', 'wyd', 'im']
    extra_abbreviations = ['ps', 'inc', 'Corp', 'Ltd', 'Co', 'pkt', 'Dz.Ap', 'Jr', 'jr' 'sp', 'Sp', 'poj', 'pseud', 'krypt', 'sygn', 'Dz.U', 'ws', 'itd', 'np', 'sanskryt', 'nr', '', 'Takht', 'tzw', 't.zw', 'ewan' ]

    lang_abbrev = ['ang', 'gr', 'hebr', 'czes', 'pol']
    military_abbrev = ['mjr', 'płk', 'dypl', 'pp' 'dyw', 'bryg', 'ppłk', 'marsz' 'rez', 'ppor', 'DPanc', 'BPanc', 'DKaw']
    position_abbrev = ['bp', 'dr', 'prof', 'zwycz', 'hab', 'B.Sc', 'Ph.D', 'lek', 'med', 'n.med' 'bł', 'św' ]

    extra_abbreviations = extra_abbreviations+lang_abbrev+military_abbrev
    roman_abbrev= ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI', 'XII', 'XII','XIV','XV','XVI', 'XVII', 'XVIII','XIX', 'XX', 'XXI' ]

    quantity_abbrev = [ 'mln', 'tys', 'km/godz', 'obr./min' 'godz', 'egz']

    actions_abbrev = ['tłum', 'zob','wym', 'pot', 'ww', 'ogł', 'tzn', 'wyd', 'min', 'm.i', 'm.in', 'm. in' 'im','muz','tj', 'dot', 'wsp', 'właść', 'przedr', 'czyt', 'proj', 'dosł' ]

    place_abbrev = ['Śl', 'płd']

    lang_abbrev = ['jęz', 'fr', 'ang', 'gr', 'hebr', 'czes', 'pol', 'niem', 'arab', 'egip', 'hiszp', 'jap', 'chin', 'kor', 'tyb', 'wiet']

    military_abbrev = ['kpt', 'kpr', 'obs', 'pil', 'mjr', 'płk', 'dypl', 'pp', 'gw', 'dyw', 'bryg', 'ppłk', 'mar', 'marsz' 'rez', 'ppor', 'DPanc', 'BPanc', 'DKaw']

    # concat all lists
    extra_abbreviations= extra_abbreviations + position_abbrev + roman_abbrev + quantity_abbrev + place_abbrev + actions_abbrev + place_abbrev + lang_abbrev+military_abbrev

    sentence_tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')
    sentence_tokenizer._params.abbrev_types.update(extra_abbreviations)

    sentence_tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')
    sentence_tokenizer._params.abbrev_types.update(extra_abbreviations)
  7. @ksopyla ksopyla renamed this gist Feb 18, 2020. 1 changed file with 0 additions and 0 deletions.
  8. @ksopyla ksopyla revised this gist Feb 18, 2020. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions polish_sentence_tokenizer.py
    Original file line number Diff line number Diff line change
    @@ -4,10 +4,10 @@
    # nltk.download()
    nltk.download('punkt')

    extra_abbreviations = ['ps', 'dr', 'prof', 'med' 'inc', 'Corp' 'pkt', 'pot', 'zob', 'm.i', 'Dz.Ap', 'Jr', 'tłum', , 'sp', 'muz', 'ww', 'Śl', 'zob', 'poj', 'm.in', 'wyd', 'im']
    extra_abbreviations = ['ps', 'dr', 'prof', 'med' 'inc', 'Corp' 'pkt', 'pot', 'zob', 'm.i', 'Dz.Ap', 'Jr', 'tłum', 'sp', 'muz', 'ww', 'Śl', 'zob', 'poj', 'm.in', 'wyd', 'im']

    lang_abbrev = ['ang', 'gr', 'hebr', 'czes', 'pol']
    military_abbrev = ['mjr','płk', 'dypl', 'pp' 'dyw', 'bryg', 'ppłk', 'marsz' 'rez', 'ppor', 'DPanc', 'BPanc', 'DKaw']
    military_abbrev = ['mjr', 'płk', 'dypl', 'pp' 'dyw', 'bryg', 'ppłk', 'marsz' 'rez', 'ppor', 'DPanc', 'BPanc', 'DKaw']

    extra_abbreviations = extra_abbreviations+lang_abbrev+military_abbrev

  9. @ksopyla ksopyla created this gist Feb 18, 2020.
    19 changes: 19 additions & 0 deletions polish_sentence_tokenizer.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,19 @@
    import nltk

    # Fetch the Punkt sentence-tokenizer models non-interactively.
    # (Call nltk.download() with no arguments for the interactive downloader.)
    nltk.download('punkt')

    # Extra abbreviations that should NOT be treated as sentence ends when they
    # are followed by a period. Entries are written without the trailing period.
    # Every entry needs its own comma: adjacent string literals such as
    # 'med' 'inc' are silently concatenated by Python into 'medinc'.
    extra_abbreviations = [
        'ps', 'dr', 'prof', 'med', 'inc', 'Corp', 'pkt', 'pot', 'zob', 'm.i',
        'Dz.Ap', 'Jr', 'tłum', 'sp', 'muz', 'ww', 'Śl', 'poj', 'm.in', 'wyd',
        'im',
    ]

    # Abbreviated language names.
    lang_abbrev = ['ang', 'gr', 'hebr', 'czes', 'pol']

    # Abbreviated military ranks and unit names.
    military_abbrev = [
        'mjr', 'płk', 'dypl', 'pp', 'dyw', 'bryg', 'ppłk', 'marsz', 'rez',
        'ppor', 'DPanc', 'BPanc', 'DKaw',
    ]

    # Concatenate all lists.
    extra_abbreviations = extra_abbreviations + lang_abbrev + military_abbrev

    # Load the pre-trained Polish Punkt model and register the extra
    # abbreviations on its parameter object (abbrev_types is a set, so
    # duplicates are harmless).
    sentence_tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')
    # Punkt lower-cases a candidate token before looking it up in abbrev_types,
    # so mixed-case entries ('Corp', 'Dz.Ap', 'DPanc', ...) must be stored
    # lower-cased or they can never match.
    sentence_tokenizer._params.abbrev_types.update(
        abbrev.lower() for abbrev in extra_abbreviations
    )

    # Placeholder input; replace with the text to split.
    text = '.....'

    sentences = sentence_tokenizer.tokenize(text)