Forked from ksopyla/polish_sentence_nltk_tokenizer.py
Last active
May 8, 2020 18:02
-
-
Save Lysander6/d17ca0aa5ecb80b049334e03c467e572 to your computer and use it in GitHub Desktop.
Revisions
-
Lysander6 revised this gist
May 8, 2020. 1 changed file with 1 addition and 1 deletion. There are no files selected for viewing.
# Revision hunk @@ -16,7 +16,7 @@: the tail of the abbreviation lists and the
# concatenation that merges every category into `extra_abbreviations`.
# The other *_abbrev lists and the initial `extra_abbreviations` are defined
# earlier in the file, outside this hunk.
lang_abbrev = [
    'jęz', 'fr', 'franc', 'ukr', 'ang', 'gr', 'hebr', 'czes', 'pol', 'niem',
    'arab', 'egip', 'hiszp', 'jap', 'chin', 'kor', 'tyb', 'wiet', 'sum',
]
# NOTE(review): Punkt looks tokens up in lower case, so mixed-case entries such
# as 'DPanc' only take effect if they are lower-cased before registration.
military_abbrev = [
    'kpt', 'kpr', 'obs', 'pil', 'mjr', 'płk', 'dypl', 'pp', 'gw', 'dyw',
    'bryg', 'ppłk', 'mar', 'marsz', 'rez', 'ppor', 'DPanc', 'BPanc', 'DKaw',
    'p.uł', 'nadkom',
]

# Merge all categories into one list. `place_abbrev` used to be added twice;
# once is enough because the result ends up in a set of abbreviation types.
extra_abbreviations = (
    extra_abbreviations
    + position_abbrev
    + quantity_abbrev
    + place_abbrev
    + actions_abbrev
    + lang_abbrev
    + military_abbrev
)
ksopyla revised this gist
Apr 1, 2020. 1 changed file with 0 additions and 2 deletions. There are no files selected for viewing.
# Revision hunk @@ -23,8 +23,6 @@: load the Polish Punkt model and register the
# extra abbreviations (`extra_abbreviations` is built earlier in the file).
# NOTE(review): nltk.data.load() unpickles this resource; only use Punkt data
# from a trusted source, and confirm the '.pickle' path still loads on the
# installed NLTK version (newer releases distribute 'punkt_tab' instead).
sentence_tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')

# Punkt lower-cases a token before looking it up in abbrev_types, so register
# the lower-cased forms; otherwise mixed-case entries can never match.
sentence_tokenizer._params.abbrev_types.update(
    abbrev.lower() for abbrev in extra_abbreviations
)

# Placeholder input; replace with the text to split into sentences.
text = '.....'
ksopyla revised this gist
Feb 27, 2020. 1 changed file with 8 additions and 11 deletions. There are no files selected for viewing.
# Revision hunk @@ -4,24 +4,21 @@ (`import nltk` sits above this hunk).
# Builds the list of extra Polish abbreviations and registers it with NLTK's
# pre-trained Polish Punkt sentence tokenizer.

# Non-interactive download of the Punkt models
# (interactive alternative: nltk.download()).
nltk.download('punkt')

extra_abbreviations = [
    'ps', 'inc', 'Corp', 'Ltd', 'Co', 'pkt', 'Dz.Ap', 'Jr', 'jr', 'sp', 'Sp',
    'poj', 'pseud', 'krypt', 'sygn', 'Dz.U', 'ws', 'itd', 'np', 'sanskryt',
    'nr', 'gł', 'Takht', 'tzw', 't.zw', 'ewan', 'tyt', 'oryg', 't.j', 'vs',
    'l.mn', 'l.poj',
]
position_abbrev = [
    'Ks', 'Abp', 'abp', 'bp', 'dr', 'kard', 'mgr', 'prof', 'zwycz', 'hab',
    'arch', 'arch.kraj', 'B.Sc', 'Ph.D', 'lek', 'med', 'n.med', 'bł', 'św',
    'hr', 'dziek',
]
# not added: tys.
quantity_abbrev = [
    'mln', 'obr./min', 'km/godz', 'godz', 'egz', 'ha', 'j.m', 'cal', 'obj',
    'alk', 'wag',
]
actions_abbrev = [
    'tłum', 'tlum', 'zob', 'wym', 'pot', 'ww', 'ogł', 'wyd', 'min', 'm.i',
    'm.in', 'in', 'im', 'muz', 'tj', 'dot', 'wsp', 'właść', 'właśc',
    'przedr', 'czyt', 'proj', 'dosł', 'hist', 'daw', 'zwł', 'zaw',
]
place_abbrev = ['Śl', 'płd', 'geogr']
lang_abbrev = [
    'jęz', 'fr', 'franc', 'ukr', 'ang', 'gr', 'hebr', 'czes', 'pol', 'niem',
    'arab', 'egip', 'hiszp', 'jap', 'chin', 'kor', 'tyb', 'wiet', 'sum',
]
military_abbrev = [
    'kpt', 'kpr', 'obs', 'pil', 'mjr', 'płk', 'dypl', 'pp', 'gw', 'dyw',
    'bryg', 'ppłk', 'mar', 'marsz', 'rez', 'ppor', 'DPanc', 'BPanc', 'DKaw',
    'p.uł',
]

# Merge all categories into one list. `place_abbrev` used to be added twice;
# once is enough because the result ends up in a set of abbreviation types.
extra_abbreviations = (
    extra_abbreviations
    + position_abbrev
    + quantity_abbrev
    + place_abbrev
    + actions_abbrev
    + lang_abbrev
    + military_abbrev
)

# NOTE(review): nltk.data.load() unpickles this resource; only use Punkt data
# from a trusted source, and confirm the '.pickle' path still loads on the
# installed NLTK version (newer releases distribute 'punkt_tab' instead).
sentence_tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')

# Punkt lower-cases a token before looking it up in abbrev_types, so register
# the lower-cased forms; otherwise entries like 'Corp' or 'DPanc' never match.
sentence_tokenizer._params.abbrev_types.update(
    abbrev.lower() for abbrev in extra_abbreviations
)
ksopyla revised this gist
Feb 19, 2020. No changes. There are no files selected for viewing.
-
ksopyla revised this gist
Feb 18, 2020. No changes. There are no files selected for viewing.
-
ksopyla revised this gist
Feb 18, 2020. 1 changed file with 19 additions and 4 deletions. There are no files selected for viewing.
# Revision hunk @@ -4,12 +4,27 @@ (`import nltk` sits above this hunk).
# Builds the list of extra Polish abbreviations, grouped by category, and
# registers it with NLTK's pre-trained Polish Punkt sentence tokenizer.

# Non-interactive download of the Punkt models
# (interactive alternative: nltk.download()).
nltk.download('punkt')

# Every entry must be followed by a comma: Python silently concatenates
# adjacent string literals, which previously produced bogus entries such as
# 'jrsp', 'n.medbł', 'obr./mingodz', 'm. inim' and 'marszrez'.
extra_abbreviations = [
    'ps', 'inc', 'Corp', 'Ltd', 'Co', 'pkt', 'Dz.Ap', 'Jr', 'jr', 'sp', 'Sp',
    'poj', 'pseud', 'krypt', 'sygn', 'Dz.U', 'ws', 'itd', 'np', 'sanskryt',
    'nr', 'gł', 'Takht', 'tzw', 't.zw', 'ewan',
]
position_abbrev = [
    'bp', 'dr', 'prof', 'zwycz', 'hab', 'B.Sc', 'Ph.D', 'lek', 'med',
    'n.med', 'bł', 'św',
]
# Roman numerals I-XXI ('XIII' was previously mistyped as a second 'XII').
roman_abbrev = [
    'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI',
    'XII', 'XIII', 'XIV', 'XV', 'XVI', 'XVII', 'XVIII', 'XIX', 'XX', 'XXI',
]
quantity_abbrev = ['mln', 'tys', 'km/godz', 'obr./min', 'godz', 'egz']
# NOTE(review): 'm. in' contains a space; Punkt tokens are presumably
# whitespace-delimited, so this entry likely never matches - confirm.
actions_abbrev = [
    'tłum', 'zob', 'wym', 'pot', 'ww', 'ogł', 'tzn', 'wyd', 'min', 'm.i',
    'm.in', 'm. in', 'im', 'muz', 'tj', 'dot', 'wsp', 'właść', 'przedr',
    'czyt', 'proj', 'dosł',
]
place_abbrev = ['Śl', 'płd']
lang_abbrev = [
    'jęz', 'fr', 'ang', 'gr', 'hebr', 'czes', 'pol', 'niem', 'arab', 'egip',
    'hiszp', 'jap', 'chin', 'kor', 'tyb', 'wiet',
]
military_abbrev = [
    'kpt', 'kpr', 'obs', 'pil', 'mjr', 'płk', 'dypl', 'pp', 'gw', 'dyw',
    'bryg', 'ppłk', 'mar', 'marsz', 'rez', 'ppor', 'DPanc', 'BPanc', 'DKaw',
]

# Concatenate all lists. `place_abbrev` used to be added twice; once is enough
# because the result ends up in a set of abbreviation types.
extra_abbreviations = (
    extra_abbreviations
    + position_abbrev
    + roman_abbrev
    + quantity_abbrev
    + place_abbrev
    + actions_abbrev
    + lang_abbrev
    + military_abbrev
)

# The load/update pair was previously repeated verbatim; one pass is enough.
# NOTE(review): nltk.data.load() unpickles this resource; only use Punkt data
# from a trusted source, and confirm the '.pickle' path still loads on the
# installed NLTK version (newer releases distribute 'punkt_tab' instead).
sentence_tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')

# Punkt lower-cases a token before looking it up in abbrev_types, so register
# the lower-cased forms; otherwise entries like 'Corp' or 'XIV' never match.
sentence_tokenizer._params.abbrev_types.update(
    abbrev.lower() for abbrev in extra_abbreviations
)
ksopyla renamed this gist
Feb 18, 2020. 1 changed file with 0 additions and 0 deletions. There are no files selected for viewing.
File renamed without changes. -
ksopyla revised this gist
Feb 18, 2020. 1 changed file with 2 additions and 2 deletions. There are no files selected for viewing.
# Revision hunk @@ -4,10 +4,10 @@ (`import nltk` sits above this hunk).
# Defines the extra Polish abbreviations that the rest of the file registers
# with the Punkt sentence tokenizer.

# Non-interactive download of the Punkt models
# (interactive alternative: nltk.download()).
nltk.download('punkt')

# Every entry must be followed by a comma: Python silently concatenates
# adjacent string literals, which previously produced the bogus entries
# 'medinc', 'Corppkt', 'ppdyw' and 'marszrez'. The duplicate 'zob' is dropped.
# NOTE(review): Punkt looks tokens up in lower case, so mixed-case entries such
# as 'Corp' only take effect if they are lower-cased before registration.
extra_abbreviations = [
    'ps', 'dr', 'prof', 'med', 'inc', 'Corp', 'pkt', 'pot', 'zob', 'm.i',
    'Dz.Ap', 'Jr', 'tłum', 'sp', 'muz', 'ww', 'Śl', 'poj', 'm.in', 'wyd',
    'im',
]
lang_abbrev = ['ang', 'gr', 'hebr', 'czes', 'pol']
military_abbrev = [
    'mjr', 'płk', 'dypl', 'pp', 'dyw', 'bryg', 'ppłk', 'marsz', 'rez',
    'ppor', 'DPanc', 'BPanc', 'DKaw',
]

# Merge the category lists into the single list used for registration.
extra_abbreviations = extra_abbreviations + lang_abbrev + military_abbrev
ksopyla created this gist
Feb 18, 2020. There are no files selected for viewing.
# Initial revision of the script (diff hunk @@ -0,0 +1,19 @@).
# Loads NLTK's pre-trained Polish Punkt sentence tokenizer, registers extra
# abbreviations with it, and splits a sample text into sentences.
import nltk

# Non-interactive download of the Punkt models
# (interactive alternative: nltk.download()).
nltk.download('punkt')

# Every entry must be followed by exactly one comma. The original list had a
# stray ', ,' (a SyntaxError) and missing commas, which Python turns into
# silently concatenated entries ('medinc', 'Corppkt'). The duplicate 'zob' is
# dropped.
extra_abbreviations = [
    'ps', 'dr', 'prof', 'med', 'inc', 'Corp', 'pkt', 'pot', 'zob', 'm.i',
    'Dz.Ap', 'Jr', 'tłum', 'sp', 'muz', 'ww', 'Śl', 'poj', 'm.in', 'wyd',
    'im',
]
lang_abbrev = ['ang', 'gr', 'hebr', 'czes', 'pol']
# Same comma fix here: 'pp' 'dyw' and 'marsz' 'rez' used to collapse into
# 'ppdyw' and 'marszrez'.
military_abbrev = [
    'mjr', 'płk', 'dypl', 'pp', 'dyw', 'bryg', 'ppłk', 'marsz', 'rez',
    'ppor', 'DPanc', 'BPanc', 'DKaw',
]

# Merge the category lists into the single list used for registration.
extra_abbreviations = extra_abbreviations + lang_abbrev + military_abbrev

# NOTE(review): nltk.data.load() unpickles this resource; only use Punkt data
# from a trusted source, and confirm the '.pickle' path still loads on the
# installed NLTK version (newer releases distribute 'punkt_tab' instead).
sentence_tokenizer = nltk.data.load('tokenizers/punkt/polish.pickle')

# Punkt lower-cases a token before looking it up in abbrev_types, so register
# the lower-cased forms; otherwise entries like 'Corp' or 'DPanc' never match.
sentence_tokenizer._params.abbrev_types.update(
    abbrev.lower() for abbrev in extra_abbreviations
)

# Placeholder input; replace with the text to split into sentences.
text = '.....'
sentences = sentence_tokenizer.tokenize(text)