Created
August 10, 2021 09:44
-
-
Save Tricked-dev/e4d73f63238fa9cdd9f81ad2cbb892e9 to your computer and use it in GitHub Desktop.
summarizer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Small string helpers used by the sentence tokenizer.
 */
const n = {};

/**
 * Tests whether the last character of `word` is one of the characters in
 * `chars`.
 *
 * @param {string} word - Token to inspect.
 * @param {string} chars - A single character, or a set of candidate
 *   characters written as one string (e.g. '?!').
 * @returns {boolean} True when the final character of `word` matches.
 */
n.endsWithChar = function ends_with_char(word, chars) {
  // An empty token has no last character. Without this guard the lookup
  // below would search for '' inside `chars`, which always succeeds.
  if (word.length === 0) return false;
  const last = word.slice(-1);
  return chars.length > 1 ? chars.includes(last) : last === chars;
};

/**
 * Tests whether `word` ends with the exact string `suffix`.
 *
 * @param {string} word - Token to inspect.
 * @param {string} suffix - Suffix to look for.
 * @returns {boolean} True when `word` ends with `suffix`.
 */
n.endsWith = function ends_with(word, suffix) {
  return word.endsWith(suffix);
};
/**
 * Token matchers used by the sentence tokenizer to decide whether a word that
 * contains or ends with a period really terminates a sentence.
 */
const i = {};

// Default abbreviations, written without their trailing period.
const r = [
  'al',
  'adj',
  'assn',
  'Ave',
  'BSc',
  'MSc',
  'Cell',
  'Ch',
  'Co',
  'cc',
  'Corp',
  'Dem',
  'Dept',
  'ed',
  'eg',
  'Eq',
  'Eqs',
  'est',
  'etc',
  'Ex',
  'ext',
  'Fig',
  'fig',
  'Figs',
  'figs',
  'i.e',
  'ie',
  'Inc',
  'inc',
  'Jan',
  'Feb',
  'Mar',
  'Apr',
  'Jun',
  'Jul',
  'Aug',
  'Sep',
  'Sept',
  'Oct',
  'Nov',
  'Dec',
  'jr',
  'mi',
  'Miss',
  'Mrs',
  'Mr',
  'Ms',
  'Mol',
  'mt',
  'mts',
  'no',
  'Nos',
  'PhD',
  'MD',
  'BA',
  'MA',
  'MM',
  'pl',
  'pop',
  'pp',
  'Prof',
  'Dr',
  'pt',
  'Ref',
  'Refs',
  'Rep',
  'repr',
  'rev',
  'Sec',
  'Secs',
  'Sgt',
  'Col',
  'Gen',
  'Sen',
  'Gov',
  'Lt',
  'Maj',
  'Capt',
  'St',
  'Sr',
  'sr',
  'Jr',
  'Rev',
  'Sun',
  'Mon',
  'Tu',
  'Tue',
  'Tues',
  'Wed',
  'Th',
  'Thu',
  'Thur',
  'Thurs',
  'Fri',
  'Sat',
  'trans',
  'Univ',
  'Viz',
  'Vol',
  'vs',
  'v',
];

// Abbreviation list currently in effect. It starts out as the defaults so
// that isCommonAbbreviation works even if setAbbreviations was never called.
let t = r;

/**
 * Selects the abbreviation list used by isCommonAbbreviation.
 *
 * @param {string[]} [abbreviations] - Custom list; any falsy value restores
 *   the built-in defaults.
 */
i.setAbbreviations = function (abbreviations) {
  t = abbreviations || r;
};

/**
 * Tests whether a word starts with an uppercase ASCII letter followed by a
 * lowercase one. Anything isNumber accepts also counts as capitalized.
 *
 * @param {string} word
 * @returns {boolean}
 */
const s = (i.isCapitalized = function (word) {
  return /^[A-Z][a-z].*/.test(word) || a(word);
});

/**
 * Tests whether a word could open a sentence: it is capitalized (see
 * isCapitalized) or has a quote mark within its first two characters.
 *
 * @param {string} word
 * @returns {boolean}
 */
i.isSentenceStarter = function (word) {
  return s(word) || /``|"|'/.test(word.substring(0, 2));
};

/**
 * Tests whether a word, with punctuation stripped, is in the active
 * abbreviation list.
 *
 * @param {string} word
 * @returns {boolean}
 */
i.isCommonAbbreviation = function (word) {
  const stripped = word.replace(/[-'`~!@#$%^&*()_|+=?;:'",.<>\{\}\[\]\\\/]/gi, '');
  return t.includes(stripped);
};

/**
 * Tests whether "a.m." / "p.m." is followed by a word ending in "day"
 * (e.g. "Monday"), in which case the period does not close the sentence.
 *
 * @param {string} word - Current token.
 * @param {string} next - Token that follows it.
 * @returns {boolean}
 */
i.isTimeAbbreviation = function (word, next) {
  if (word === 'a.m.' || word === 'p.m.') {
    const tail = next.replace(/\W+/g, '').slice(-3).toLowerCase();
    if (tail === 'day') return true;
  }
  return false;
};

/**
 * Tests whether a word, ignoring brackets, begins with one or more
 * "<character>." pairs, e.g. "U.S.".
 *
 * @param {string} word
 * @returns {boolean}
 */
i.isDottedAbbreviation = function (word) {
  // The pattern can match the empty string, so the match is never null; only
  // a non-empty match at the start of the word counts.
  const found = word.replace(/[\(\)\[\]\{\}]/g, '').match(/(.\.)*/);
  return found !== null && found[0].length > 0;
};

/**
 * Tests whether a word is short enough (three characters or fewer) or
 * capitalized, so that it may be treated as an ad-hoc abbreviation.
 *
 * @param {string} word
 * @returns {boolean}
 */
i.isCustomAbbreviation = function (word) {
  return word.length <= 3 || s(word);
};

/**
 * Heuristic for periods that belong to a name rather than a sentence end.
 *
 * @param {number} wordCount - Words seen since the counter was last reset.
 * @param {string[]} words - Window of tokens starting at the current one.
 * @returns {boolean} True when the first token is a short capitalized word
 *   near the start of the run, or at least three tokens in the window begin
 *   with an uppercase ASCII letter.
 */
i.isNameAbbreviation = function (wordCount, words) {
  if (words.length > 0) {
    if (wordCount < 5 && words[0].length < 6 && s(words[0])) return true;
    const capitalized = words.filter(function (word) {
      return /[A-Z]/.test(word.charAt(0));
    });
    return capitalized.length >= 3;
  }
  return false;
};

/**
 * Tests whether a string is numeric.
 *
 * @param {string} str - Token to inspect.
 * @param {number} [dotPos] - When truthy, only the three characters centred
 *   on this index are tested (the character before, at, and after it).
 * @returns {boolean}
 */
const a = (i.isNumber = function (str, dotPos) {
  if (dotPos) str = str.slice(dotPos - 1, dotPos + 2);
  // The coercing global isNaN is intentional: Number.isNaN is false for every
  // string. Note that '' and whitespace coerce to 0 and so count as numbers.
  return !isNaN(str);
});

/**
 * Matches a token against a North-American style phone number pattern.
 *
 * @param {string} str
 * @returns {RegExpMatchArray|null} The match, or null when there is none.
 */
i.isPhoneNr = function (str) {
  return str.match(
    /^(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?$/
  );
};

/**
 * Matches a token against a loose URL / domain pattern.
 *
 * @param {string} str
 * @returns {RegExpMatchArray|null} The match, or null when there is none.
 */
i.isURL = function (str) {
  return str.match(
    /[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)/
  );
};

/**
 * Detects two sentences glued together without whitespace ("end.Next").
 * Looks at the first '.', falling back to the first '!' and then the first
 * '?', and requires an ASCII letter directly after it.
 *
 * @param {string} word
 * @returns {string[]|false} `[before, after]` around the boundary character,
 *   or false when the word is not a concatenation.
 */
i.isConcatenated = function (word) {
  let pos = word.indexOf('.');
  if (pos === -1) pos = word.indexOf('!');
  if (pos === -1) pos = word.indexOf('?');
  if (pos > -1) {
    const following = word.charAt(pos + 1);
    if (following.match(/[a-zA-Z].*/)) return [word.slice(0, pos), word.slice(pos + 1)];
  }
  return false;
};

/**
 * Tests whether a token is exactly one sentence-ending character.
 *
 * @param {string} word
 * @returns {boolean}
 */
i.isBoundaryChar = function (word) {
  return word === '.' || word === '!' || word === '?';
};
/**
 * Sentence tokenizer: splits text into sentences using the string helpers
 * (`n`) and token matchers (`i`) defined earlier in this module.
 */
const u = {};
const f = n; // string helpers
const c = i; // token matchers
// Placeholder inserted where a newline / HTML boundary was found; it becomes
// its own token and is recognised (trimmed form) as a hard sentence break.
const h = ' @~@ ';
const v = h.trim();
const p = new RegExp('\\S', ''); // any non-whitespace character
const b = new RegExp('\\n+|[-#=_+*]{4,}', 'g'); // newlines or rule-like runs
const d = new RegExp('\\S+|\\n', 'g'); // word tokens and single newlines

/**
 * Splits `text` into an array of sentences.
 *
 * @param {string} text - Text to split. Non-strings, empty strings and
 *   whitespace-only strings yield an empty array.
 * @param {boolean|Object} [userOptions] - Either a boolean shorthand for
 *   `newline_boundaries`, or an object overriding any of these defaults:
 *   `newline_boundaries`, `html_boundaries`, `html_boundaries_tags`,
 *   `sanitize`, `allowed_tags`, `preserve_whitespace`, `abbreviations`.
 * @returns {string[]} The sentences, in order. Tokens are joined with single
 *   spaces unless `preserve_whitespace` is set and neither boundary option is
 *   enabled, in which case the original whitespace is kept.
 */
u.sentences = function (text, userOptions) {
  if (!text || typeof text !== 'string' || !text.length) return [];
  if (!p.test(text)) return [];

  const options = {
    newline_boundaries: false,
    html_boundaries: false,
    html_boundaries_tags: ['p', 'div', 'ul', 'ol'],
    sanitize: false,
    allowed_tags: false,
    preserve_whitespace: false,
    abbreviations: null,
  };
  if (typeof userOptions === 'boolean') {
    // Boolean shorthand: honour the value that was passed, so that `false`
    // does not switch newline boundaries on.
    options.newline_boundaries = userOptions;
  } else {
    for (const key in userOptions) options[key] = userOptions[key];
  }
  c.setAbbreviations(options.abbreviations);

  let source = text;
  if (options.newline_boundaries) source = source.replace(b, h);
  if (options.html_boundaries) {
    const pattern =
      '(<br\\s*\\/?>|<\\/(' + options.html_boundaries_tags.join('|') + ')>)';
    source = source.replace(new RegExp(pattern, 'g'), '$1' + h);
  }
  if (options.sanitize || options.allowed_tags) {
    if (!options.allowed_tags) options.allowed_tags = [''];
    // NOTE(review): `l` is not defined anywhere in the visible source;
    // presumably an HTML sanitizer import that was lost. Confirm before
    // relying on the `sanitize` / `allowed_tags` options.
    source = l(source, { allowedTags: options.allowed_tags });
  }

  let words;
  let pieces; // only set for preserve_whitespace: tokens interleaved with gaps
  if (options.preserve_whitespace) {
    // Splitting on a capturing group puts the tokens at the odd indices and
    // the text between them at the even indices.
    pieces = source.split(/(<br\s*\/?>|\S+|\n+)/);
    words = pieces.filter(function (piece, index) {
      return index % 2;
    });
  } else {
    words = source.trim().match(d);
  }
  if (!words || !words.length) return [];

  let wordCount = 0; // words since the last comma or sentence break
  let sentences = [];
  let current = [];

  for (let idx = 0, total = words.length; idx < total; idx++) {
    wordCount++;
    current.push(words[idx]);
    if (words[idx].includes(',')) wordCount = 0;

    // Hard boundaries: a lone '.', '!' or '?', a trailing '?' / '!', or the
    // boundary placeholder.
    if (
      c.isBoundaryChar(words[idx]) ||
      f.endsWithChar(words[idx], '?!') ||
      words[idx] === v
    ) {
      // The placeholder itself is dropped from the output when it was
      // injected by one of the boundary options.
      if ((options.newline_boundaries || options.html_boundaries) && words[idx] === v) {
        current.pop();
      }
      sentences.push(current);
      wordCount = 0;
      current = [];
      continue;
    }

    // Ignore one trailing double quote for the checks below. `current`
    // already holds the unmodified token, so the output keeps the quote.
    if (f.endsWithChar(words[idx], '"') || f.endsWithChar(words[idx], '”')) {
      words[idx] = words[idx].slice(0, -1);
    }
    const word = words[idx];

    if (f.endsWithChar(word, '.')) {
      if (idx + 1 < total) {
        // A single non-digit character plus a period (e.g. an initial).
        // The coercing global isNaN is required here: the operand is a string.
        if (word.length === 2 && isNaN(word.charAt(0))) continue;
        if (c.isCommonAbbreviation(word)) continue;
        if (c.isSentenceStarter(words[idx + 1])) {
          if (c.isTimeAbbreviation(word, words[idx + 1])) continue;
          // NOTE(review): slice's second argument is an absolute end index,
          // so this window is empty once idx >= 6 (and idx >= 5 below). Left
          // unchanged on purpose: altering it moves sentence breaks.
          if (c.isNameAbbreviation(wordCount, words.slice(idx, 6))) continue;
          if (c.isNumber(words[idx + 1]) && c.isCustomAbbreviation(word)) continue;
        } else {
          if (f.endsWith(word, '..')) continue;
          if (c.isDottedAbbreviation(word)) continue;
          if (c.isNameAbbreviation(wordCount, words.slice(idx, 5))) continue;
        }
      }
      sentences.push(current);
      current = [];
      wordCount = 0;
    } else {
      const dotIndex = word.indexOf('.');
      if (dotIndex > -1) {
        if (c.isNumber(word, dotIndex)) continue;
        if (c.isDottedAbbreviation(word)) continue;
        if (c.isURL(word) || c.isPhoneNr(word)) continue;
      }
      // Two sentences written without a space in between: split the token.
      const parts = c.isConcatenated(word);
      if (parts) {
        current.pop();
        current.push(parts[0]);
        sentences.push(current);
        current = [];
        wordCount = 0;
        current.push(parts[1]);
      }
    }
  }
  if (current.length) sentences.push(current);

  sentences = sentences.filter(function (sentence) {
    return sentence.length > 0;
  });
  // Every token may have been a stripped boundary placeholder; without this
  // guard the merge below would be seeded with `undefined` and throw.
  if (!sentences.length) return [];

  // Re-attach a one-token sentence of the form "X." / "XX." to the sentence
  // that follows it, provided that sentence's first token contains no period.
  const merged = sentences.slice(1).reduce(
    function (acc, sentence) {
      const last = acc[acc.length - 1];
      if (last.length === 1 && /^.{1,2}[.]$/.test(last[0]) && !/[.]/.test(sentence[0])) {
        acc.pop();
        acc.push(last.concat(sentence));
        return acc;
      }
      acc.push(sentence);
      return acc;
    },
    [sentences[0]]
  );

  return merged.map(function (sentence, index) {
    if (
      options.preserve_whitespace &&
      !options.newline_boundaries &&
      !options.html_boundaries
    ) {
      // Each token owns two entries of `pieces` (token + following gap); the
      // first sentence also takes the leading gap before the first token.
      let take = 2 * sentence.length;
      if (index === 0) take += 1;
      return pieces.splice(0, take).join('');
    }
    return sentence.join(' ');
  });
};

const g = u.sentences;
export default u;
export { g as sentences };
| //# sourceMappingURL=tokenizer.js.map |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment