Skip to content

Instantly share code, notes, and snippets.

@Tricked-dev
Created August 10, 2021 09:44
Show Gist options
  • Save Tricked-dev/e4d73f63238fa9cdd9f81ad2cbb892e9 to your computer and use it in GitHub Desktop.
Save Tricked-dev/e4d73f63238fa9cdd9f81ad2cbb892e9 to your computer and use it in GitHub Desktop.
summarizer
var n = {};
n.endsWithChar = function ends_with_char(e, n) {
return n.length > 1 ? n.indexOf(e.slice(-1)) > -1 : e.slice(-1) === n;
};
n.endsWith = function ends_with(e, n) {
return e.slice(e.length - n.length) === n;
};
// Namespace object that the token-classification predicates are attached to.
var i = {};
// Abbreviation list currently in effect; assigned by i.setAbbreviations.
var t;
// Built-in abbreviations, written without their trailing period.
// Every entry is listed exactly once.
// NOTE(review): i.isCommonAbbreviation strips '.' from a token before the
// lookup, so the dotted entry 'i.e' looks unreachable through that path;
// confirm before removing it.
var r = [
  'al',
  'adj',
  'assn',
  'Ave',
  'BSc',
  'MSc',
  'Cell',
  'Ch',
  'Co',
  'cc',
  'Corp',
  'Dem',
  'Dept',
  'ed',
  'eg',
  'Eq',
  'Eqs',
  'est',
  'etc',
  'Ex',
  'ext',
  'Fig',
  'fig',
  'Figs',
  'figs',
  'i.e',
  'ie',
  'Inc',
  'inc',
  'Jan',
  'Feb',
  'Mar',
  'Apr',
  'Jun',
  'Jul',
  'Aug',
  'Sep',
  'Sept',
  'Oct',
  'Nov',
  'Dec',
  'jr',
  'mi',
  'Miss',
  'Mrs',
  'Mr',
  'Ms',
  'Mol',
  'mt',
  'mts',
  'no',
  'Nos',
  'PhD',
  'MD',
  'BA',
  'MA',
  'MM',
  'pl',
  'pop',
  'pp',
  'Prof',
  'Dr',
  'pt',
  'Ref',
  'Refs',
  'Rep',
  'repr',
  'rev',
  'Sec',
  'Secs',
  'Sgt',
  'Col',
  'Gen',
  'Sen',
  'Gov',
  'Lt',
  'Maj',
  'Capt',
  'St',
  'Sr',
  'sr',
  'Jr',
  'Rev',
  'Sun',
  'Mon',
  'Tu',
  'Tue',
  'Tues',
  'Wed',
  'Th',
  'Thu',
  'Thur',
  'Thurs',
  'Fri',
  'Sat',
  'trans',
  'Univ',
  'Viz',
  'Vol',
  'vs',
  'v',
];

/**
 * Selects the abbreviation list used by later lookups.
 *
 * @param {Array} [e] - Replacement list; it is used as-is, and lookups call
 *   `indexOf` on it. Any falsy value selects the built-in list instead.
 * @returns {void}
 */
i.setAbbreviations = function (e) {
  t = e || r;
};
/**
 * Reports whether a token counts as capitalized: it starts with an
 * uppercase ASCII letter followed by a lowercase one, or it is accepted by
 * `a` (the function also exposed as i.isNumber).
 *
 * @param {string} e - Token to test.
 * @returns {boolean} True when either condition holds.
 */
var s = (i.isCapitalized = function (e) {
  if (/^[A-Z][a-z].*/.test(e)) {
    return true;
  }
  return a(e);
});
/**
 * Reports whether a token could open a sentence: it is capitalized (per
 * `s`), or a quote mark (two backticks, a double quote or a single quote)
 * appears within its first two characters.
 *
 * @param {string} e - Token to test.
 * @returns {boolean} True when the token looks like a sentence start.
 */
i.isSentenceStarter = function (e) {
  if (s(e)) {
    return true;
  }
  var opening = e.substring(0, 2);
  return /``|"|'/.test(opening);
};
/**
 * Looks a token up in the active abbreviation list after removing
 * punctuation from it.
 *
 * @param {string} e - Token to look up.
 * @returns {number} Bitwise NOT of the list index: 0 when the token is not
 *   listed, a non-zero (truthy) number when it is.
 */
i.isCommonAbbreviation = function (e) {
  var punctuation = /[-'`~!@#$%^&*()_|+=?;:'",.<>\{\}\[\]\\\/]/gi;
  var bare = e.replace(punctuation, '');
  var position = t.indexOf(bare);
  // ~(-1) === 0, so "not found" is the only falsy result.
  return ~position;
};
/**
 * Reports whether "a.m." / "p.m." is followed by a word ending in "day".
 *
 * @param {string} e - Current token; only the exact strings 'a.m.' and
 *   'p.m.' qualify.
 * @param {string} n - Following token; its non-word characters are removed
 *   and its last three characters are compared case-insensitively.
 * @returns {boolean} True when both conditions hold.
 */
i.isTimeAbbreviation = function (e, n) {
  var isMeridiem = e === 'a.m.' || e === 'p.m.';
  if (!isMeridiem) {
    return false;
  }
  var ending = n.replace(/\W+/g, '').slice(-3).toLowerCase();
  return ending === 'day';
};
/**
 * Reports whether a token, once brackets are removed, begins with at least
 * one "character followed by a period" pair (e.g. "U.S.").
 *
 * @param {string} e - Token to test.
 * @returns {boolean} True when such a leading run exists. A falsy match
 *   result would be returned unchanged.
 */
i.isDottedAbbreviation = function (e) {
  var unbracketed = e.replace(/[\(\)\[\]\{\}]/g, '');
  var found = unbracketed.match(/(.\.)*/);
  if (!found) {
    return found;
  }
  // The pattern may match the empty string; only a non-empty run counts.
  return found[0].length > 0;
};
/**
 * Reports whether a token is short (three characters or fewer) or
 * capitalized (per `s`).
 *
 * @param {string} e - Token to test.
 * @returns {boolean} True when either condition holds.
 */
i.isCustomAbbreviation = function (e) {
  if (e.length <= 3) {
    return true;
  }
  return s(e);
};
/**
 * Heuristic for periods that belong to a name rather than ending a sentence.
 *
 * @param {number} e - Word counter supplied by the caller.
 * @param {string[]} n - Window of tokens to inspect.
 * @returns {boolean} False for an empty window. True when the counter is
 *   below 5 and the first token is shorter than 6 characters and
 *   capitalized (per `s`); otherwise true when at least three tokens in the
 *   window start with an uppercase ASCII letter.
 */
i.isNameAbbreviation = function (e, n) {
  if (!(n.length > 0)) {
    return false;
  }
  var first = n[0];
  if (e < 5 && first.length < 6 && s(first)) {
    return true;
  }
  var upperInitials = n.reduce(function (count, word) {
    return /[A-Z]/.test(word.charAt(0)) ? count + 1 : count;
  }, 0);
  return upperInitials >= 3;
};
/**
 * Reports whether a string, or a small window of it, coerces to a number.
 *
 * @param {string} e - Text to test.
 * @param {number} [n] - When truthy, only the characters from index n - 1
 *   up to (not including) n + 2 are tested.
 * @returns {boolean} True when the tested text is not NaN under the global
 *   (coercing) isNaN.
 */
var a = (i.isNumber = function (e, n) {
  var candidate = n ? e.slice(n - 1, n + 2) : e;
  return !isNaN(candidate);
});
/**
 * Matches a whole token against the phone-number pattern below.
 *
 * @param {string} e - Token to test.
 * @returns {Array|null} The match result (truthy) or null when the token
 *   does not match.
 */
i.isPhoneNr = function (e) {
  var phonePattern =
    /^(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?$/;
  return e.match(phonePattern);
};
/**
 * Searches a token for something shaped like a URL or host name.
 *
 * @param {string} e - Token to test.
 * @returns {Array|null} The match result (truthy) or null when nothing in
 *   the token matches. The pattern is not anchored.
 */
i.isURL = function (e) {
  var urlPattern =
    /[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)/;
  return e.match(urlPattern);
};
/**
 * Detects two sentences glued together without whitespace, e.g. "end.Start".
 *
 * The split point is the first '.', or failing that the first '!', or
 * failing that the first '?'. The token is split only when the character
 * right after that point is an ASCII letter.
 *
 * @param {string} e - Token to test.
 * @returns {Array|boolean} A two-element array holding the text before the
 *   mark and the text after it (the mark itself is dropped), or false.
 */
i.isConcatenated = function (e) {
  var marks = ['.', '!', '?'];
  var splitAt = -1;
  // Marks are tried in priority order; the first one present wins.
  for (var k = 0; k < marks.length && splitAt < 0; k++) {
    splitAt = e.indexOf(marks[k]);
  }
  if (splitAt < 0) {
    return false;
  }
  var following = e.charAt(splitAt + 1);
  if (!/[a-zA-Z].*/.test(following)) {
    return false;
  }
  return [e.slice(0, splitAt), e.slice(splitAt + 1)];
};
/**
 * Reports whether a token is exactly one sentence-ending mark.
 *
 * @param {string} e - Token to test.
 * @returns {boolean} True for '.', '!' or '?'.
 */
i.isBoundaryChar = function (e) {
  switch (e) {
    case '.':
    case '!':
    case '?':
      return true;
    default:
      return false;
  }
};
// Object that carries the tokenizer entry point.
var u = {};
// Short aliases for the string helpers and the token predicates.
var f = n;
var c = i;
// Marker inserted into the text where a forced boundary is wanted; the
// trimmed form is what shows up as a stand-alone token.
var h = ' @~@ ';
var v = h.trim();
// At least one non-whitespace character.
var p = /\S/;
// A run of newlines, or four or more of the characters - # = _ + *.
var b = /\n+|[-#=_+*]{4,}/g;
// One token: a run of non-whitespace characters, or a single newline.
var d = /\S+|\n/g;
/**
 * Splits text into sentences.
 *
 * The text is cut into whitespace-separated tokens; each token that ends in
 * or contains a period, '!' or '?' is run through the predicates on `c` to
 * decide whether it closes the current sentence.
 *
 * @param {string} e - Text to split. Anything that is not a non-empty string
 *   containing at least one non-whitespace character yields [].
 * @param {Object|boolean} [n] - Options, copied key by key over the defaults:
 *   newline_boundaries, html_boundaries, html_boundaries_tags, sanitize,
 *   allowed_tags, preserve_whitespace, abbreviations. Passing a boolean
 *   instead only switches newline_boundaries on.
 * @returns {string[]} One string per detected sentence; [] when no sentence
 *   is left after empty ones are discarded.
 */
u.sentences = function (e, n) {
  if (!e || 'string' !== typeof e || !e.length) return [];
  if (!p.test(e)) return [];
  // Defaults; caller-supplied options overwrite them below.
  var i = {
    newline_boundaries: false,
    html_boundaries: false,
    html_boundaries_tags: ['p', 'div', 'ul', 'ol'],
    sanitize: false,
    allowed_tags: false,
    preserve_whitespace: false,
    abbreviations: null,
  };
  // NOTE(review): any boolean, including `false`, turns newline_boundaries
  // on. Confirm that is the intended shorthand.
  if ('boolean' === typeof n) i.newline_boundaries = true;
  else for (var t in n) i[t] = n[t];
  // A null/falsy `abbreviations` option selects the built-in list.
  c.setAbbreviations(i.abbreviations);
  // Replace newline runs and separator lines with the boundary marker.
  i.newline_boundaries && (e = e.replace(b, h));
  if (i.html_boundaries) {
    // Append the marker after every <br> and every listed closing tag.
    var r = '(<br\\s*\\/?>|<\\/(' + i.html_boundaries_tags.join('|') + ')>)';
    var s = new RegExp(r, 'g');
    e = e.replace(s, '$1' + h);
  }
  if (i.sanitize || i.allowed_tags) {
    i.allowed_tags || (i.allowed_tags = ['']);
    // NOTE(review): `l` is not defined in the visible part of this file.
    // Presumably an HTML-sanitizer import; confirm it is in scope, otherwise
    // this branch throws a ReferenceError.
    e = l(e, { allowedTags: i.allowed_tags });
  }
  var a; // tokens
  var o; // tokens interleaved with separators (preserve_whitespace only)
  if (i.preserve_whitespace) {
    // The capturing group keeps the tokens in the result: odd indexes hold
    // tokens, even indexes hold the text between them.
    o = e.split(/(<br\s*\/?>|\S+|\n+)/);
    a = o.filter(function (e, n) {
      return n % 2;
    });
  } else a = e.trim().match(d);
  var u = 0; // tokens seen since the last boundary or comma-bearing token
  var g = 0; // index of '.' inside the current token
  var m = []; // result of the concatenation check
  var _ = []; // finished sentences, each an array of tokens
  var A = []; // tokens of the sentence being built
  if (!a || !a.length) return [];
  for (var w = 0, C = a.length; w < C; w++) {
    u++;
    A.push(a[w]);
    ~a[w].indexOf(',') && (u = 0);
    if (c.isBoundaryChar(a[w]) || f.endsWithChar(a[w], '?!') || a[w] === v) {
      // Hard boundary. The marker itself is dropped from the output when one
      // of the boundary options put it there.
      (i.newline_boundaries || i.html_boundaries) && a[w] === v && A.pop();
      _.push(A);
      u = 0;
      A = [];
    } else {
      // Look past one trailing closing quote. Only a[w] is rewritten; the
      // copy already pushed onto A keeps its quote.
      (f.endsWithChar(a[w], '"') || f.endsWithChar(a[w], '”')) &&
        (a[w] = a[w].slice(0, -1));
      if (f.endsWithChar(a[w], '.')) {
        // Every `continue` below means "this period does not end the
        // sentence". The last token always ends it.
        if (w + 1 < C) {
          // Two-character token whose first character is not numeric.
          if (2 === a[w].length && isNaN(a[w].charAt(0))) continue;
          if (c.isCommonAbbreviation(a[w])) continue;
          if (c.isSentenceStarter(a[w + 1])) {
            if (c.isTimeAbbreviation(a[w], a[w + 1])) continue;
            // NOTE(review): the second argument of slice is an absolute end
            // index, so the window is empty once w >= 6 (w >= 5 in the other
            // branch) and the check then always fails. Possibly meant
            // w + 6 / w + 5; confirm before changing, since a fix would
            // alter the output.
            if (c.isNameAbbreviation(u, a.slice(w, 6))) continue;
            if (c.isNumber(a[w + 1]) && c.isCustomAbbreviation(a[w])) continue;
          } else {
            // Ellipsis.
            if (f.endsWith(a[w], '..')) continue;
            if (c.isDottedAbbreviation(a[w])) continue;
            if (c.isNameAbbreviation(u, a.slice(w, 5))) continue;
          }
        }
        _.push(A);
        A = [];
        u = 0;
      } else {
        if ((g = a[w].indexOf('.')) > -1) {
          // A period inside the token: skip decimals, dotted abbreviations,
          // URLs and phone numbers.
          if (c.isNumber(a[w], g)) continue;
          if (c.isDottedAbbreviation(a[w])) continue;
          if (c.isURL(a[w]) || c.isPhoneNr(a[w])) continue;
        }
        if ((m = c.isConcatenated(a[w]))) {
          // Two sentences glued together: the first part closes the current
          // sentence, the second part opens the next one.
          A.pop();
          A.push(m[0]);
          _.push(A);
          A = [];
          u = 0;
          A.push(m[1]);
        }
      }
    }
  }
  // Flush whatever is left after the last boundary.
  A.length && _.push(A);
  _ = _.filter(function (e) {
    return e.length > 0;
  });
  // Nothing but boundary markers in the input (e.g. "----" with
  // newline_boundaries on): without this guard the merge pass below would be
  // seeded with `undefined` and the final map would throw.
  if (!_.length) return [];
  // Merge pass: a sentence made of one token of one or two characters plus a
  // period is glued onto the following sentence, unless that sentence's first
  // token contains a period.
  var S = _.slice(1).reduce(
    function (e, n) {
      var i = e[e.length - 1];
      if (1 === i.length && /^.{1,2}[.]$/.test(i[0]) && !/[.]/.test(n[0])) {
        e.pop();
        e.push(i.concat(n));
        return e;
      }
      e.push(n);
      return e;
    },
    [_[0]]
  );
  return S.map(function (e, n) {
    if (i.preserve_whitespace && !i.newline_boundaries && !i.html_boundaries) {
      // Take the original text back out of `o`: two entries per token
      // (token plus separator), plus the leading separator for the first
      // sentence.
      var t = 2 * e.length;
      0 === n && (t += 1);
      return o.splice(0, t).join('');
    }
    return e.join(' ');
  });
};
// Local alias so the tokenizer function can be exported under its own name.
const g = u.sentences;
// Default export: the object that carries `sentences`.
export default u;
// Named export: the same function, importable as `sentences`.
export { g as sentences };
//# sourceMappingURL=tokenizer.js.map
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment