# text.py (gist psorianom/0b9d8a742fe0efe0fe82, created August 23, 2014)
    # -*- coding: utf-8 -*-
    # Authors: Olivier Grisel <[email protected]>
    # Mathieu Blondel <[email protected]>
    # Lars Buitinck <[email protected]>
    # Robert Layton <[email protected]>
    # Jochen Wersdörfer <[email protected]>
    # Roman Sinayev <[email protected]>
    #
    # License: BSD 3 clause
    """
    The :mod:`sklearn.feature_extraction.text` submodule gathers utilities to
    build feature vectors from text documents.
    """
    from __future__ import unicode_literals

    import array
    from collections import Mapping, defaultdict
    import numbers
    from operator import itemgetter
    import re
    import unicodedata

    import numpy as np
    import scipy.sparse as sp

    from ..base import BaseEstimator, TransformerMixin
    from ..externals.six.moves import xrange
    from ..preprocessing import normalize
    from .hashing import FeatureHasher
    from .stop_words import ENGLISH_STOP_WORDS
    from ..utils import deprecated
    from ..externals import six


    __all__ = ['CountVectorizer',
    'ENGLISH_STOP_WORDS',
    'TfidfTransformer',
    'TfidfVectorizer',
    'strip_accents_ascii',
    'strip_accents_unicode',
    'strip_tags']


    def strip_accents_unicode(s):
    """Transform accentuated unicode symbols into their simple counterpart
    Warning: the python-level loop and join operations make this
    implementation 20 times slower than the strip_accents_ascii basic
    normalization.
    See also
    --------
    strip_accents_ascii
Remove accents from any unicode symbol that has a direct
ASCII equivalent.
    """
    return ''.join([c for c in unicodedata.normalize('NFKD', s)
    if not unicodedata.combining(c)])


    def strip_accents_ascii(s):
    """Transform accentuated unicode symbols into ascii or nothing
    Warning: this solution is only suited for languages that have a direct
    transliteration to ASCII symbols.
    See also
    --------
    strip_accents_unicode
Remove accents from any unicode symbol.
    """
    nkfd_form = unicodedata.normalize('NFKD', s)
    return nkfd_form.encode('ASCII', 'ignore').decode('ASCII')


    def strip_tags(s):
    """Basic regexp based HTML / XML tag stripper function
    For serious HTML/XML preprocessing you should rather use an external
    library such as lxml or BeautifulSoup.
    """
    return re.compile(r"<([^>]+)>", flags=re.UNICODE).sub(" ", s)
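

# --- Illustration (not part of the upstream scikit-learn module) -------------
# Doc-only sketch contrasting the two accent strippers above; the helper name
# is hypothetical and the function is never called anywhere in this module.
def _example_strip_accents():
    s = 'Málaga'
    # Both calls return 'Malaga' for Latin scripts; for symbols with no ASCII
    # transliteration, strip_accents_ascii drops the character entirely while
    # strip_accents_unicode keeps it.
    return strip_accents_unicode(s), strip_accents_ascii(s)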


    def _check_stop_list(stop):
    if stop == "english":
    return ENGLISH_STOP_WORDS
    elif isinstance(stop, six.string_types):
    raise ValueError("not a built-in stop list: %s" % stop)
    else: # assume it's a collection
    return stop


    class VectorizerMixin(object):
    """Provides common code for text vectorizers (tokenization logic)."""

    _white_spaces = re.compile(r"\s\s+")

    def decode(self, doc):
    """Decode the input into a string of unicode symbols
    The decoding strategy depends on the vectorizer parameters.
    """
    if self.input == 'filename':
    with open(doc, 'rb') as fh:
    doc = fh.read()

    elif self.input == 'file':
    doc = doc.read()

    if isinstance(doc, bytes):
    doc = doc.decode(self.encoding, self.decode_error)

    if doc is np.nan:
    raise ValueError("np.nan is an invalid document, expected byte or "
    "unicode string.")

    return doc

    def _word_ngrams(self, tokens, stop_words=None):
    """Turn tokens into a sequence of n-grams after stop words filtering"""
    # handle stop words
    if stop_words is not None:
    tokens = [w for w in tokens if w not in stop_words]

    # handle token n-grams
    min_n, max_n = self.ngram_range
    if max_n != 1:
    original_tokens = tokens
    tokens = []
    n_original_tokens = len(original_tokens)
    for n in xrange(min_n,
    min(max_n + 1, n_original_tokens + 1)):
    for i in xrange(n_original_tokens - n + 1):
    tokens.append(" ".join(original_tokens[i: i + n]))

    return tokens
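
# Example (doc-only sketch): with ngram_range=(1, 2) and stop_words=None,
# the tokens ['please', 'call', 'me'] expand to
# ['please', 'call', 'me', 'please call', 'call me'].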

    def _char_ngrams(self, text_document):
    """Tokenize text_document into a sequence of character n-grams"""
    # normalize white spaces
    text_document = self._white_spaces.sub(" ", text_document)

    text_len = len(text_document)
    ngrams = []
    min_n, max_n = self.ngram_range
    for n in xrange(min_n, min(max_n + 1, text_len + 1)):
    for i in xrange(text_len - n + 1):
    ngrams.append(text_document[i: i + n])
    return ngrams

    def _char_wb_ngrams(self, text_document):
    """Whitespace sensitive char-n-gram tokenization.
    Tokenize text_document into a sequence of character n-grams
    excluding any whitespace (operating only inside word boundaries)"""
    # normalize white spaces
    text_document = self._white_spaces.sub(" ", text_document)

    min_n, max_n = self.ngram_range
    ngrams = []
    for w in text_document.split():
    w = ' ' + w + ' '
    w_len = len(w)
    for n in xrange(min_n, max_n + 1):
    offset = 0
    ngrams.append(w[offset:offset + n])
    while offset + n < w_len:
    offset += 1
    ngrams.append(w[offset:offset + n])
    if offset == 0: # count a short word (w_len < n) only once
    break
    return ngrams
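
# Example (doc-only sketch): with ngram_range=(3, 3) the text "jumpy fox"
# yields [' ju', 'jum', 'ump', 'mpy', 'py ', ' fo', 'fox', 'ox '] because each
# word is padded with a leading and trailing space before slicing.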

    def build_preprocessor(self):
    """Return a function to preprocess the text before tokenization"""
    if self.preprocessor is not None:
    return self.preprocessor

    # unfortunately python functools package does not have an efficient
    # `compose` function that would have allowed us to chain a dynamic
    # number of functions. However the cost of a lambda call is a few
    # hundreds of nanoseconds which is negligible when compared to the
    # cost of tokenizing a string of 1000 chars for instance.
    noop = lambda x: x

    # accent stripping
    if not self.strip_accents:
    strip_accents = noop
    elif callable(self.strip_accents):
    strip_accents = self.strip_accents
    elif self.strip_accents == 'ascii':
    strip_accents = strip_accents_ascii
    elif self.strip_accents == 'unicode':
    strip_accents = strip_accents_unicode
    else:
    raise ValueError('Invalid value for "strip_accents": %s' %
    self.strip_accents)

    if self.lowercase:
    return lambda x: strip_accents(x.lower())
    else:
    return strip_accents

    def build_tokenizer(self):
    """Return a function that splits a string into a sequence of tokens"""
    if self.tokenizer is not None:
    return self.tokenizer
    token_pattern = re.compile(self.token_pattern)
    return lambda doc: token_pattern.findall(doc)

    def get_stop_words(self):
    """Build or fetch the effective stop words list"""
    return _check_stop_list(self.stop_words)

    def build_analyzer(self):
    """Return a callable that handles preprocessing and tokenization"""
    if callable(self.analyzer):
    return self.analyzer

    preprocess = self.build_preprocessor()

    if self.analyzer == 'char':
    return lambda doc: self._char_ngrams(preprocess(self.decode(doc)))

    elif self.analyzer == 'char_wb':
    return lambda doc: self._char_wb_ngrams(
    preprocess(self.decode(doc)))

    elif self.analyzer == 'word':
    stop_words = self.get_stop_words()
    tokenize = self.build_tokenizer()

    return lambda doc: self._word_ngrams(
    tokenize(preprocess(self.decode(doc))), stop_words)

    else:
    raise ValueError('%s is not a valid tokenization scheme/analyzer' %
    self.analyzer)
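
# Example (doc-only sketch): with default settings the 'word' analyzer built
# above chains decode, lowercasing, regexp tokenization and n-gram expansion:
#   analyze = CountVectorizer().build_analyzer()
#   analyze("The Fox!!")   # -> ['the', 'fox']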

    def _check_vocabulary(self):
    vocabulary = self.vocabulary
    if vocabulary is not None:
    if not isinstance(vocabulary, Mapping):
    vocab = {}
    for i, t in enumerate(vocabulary):
    if vocab.setdefault(t, i) != i:
    msg = "Duplicate term in vocabulary: %r" % t
    raise ValueError(msg)
    vocabulary = vocab
    else:
    indices = set(six.itervalues(vocabulary))
    if len(indices) != len(vocabulary):
    raise ValueError("Vocabulary contains repeated indices.")
    for i in xrange(len(vocabulary)):
    if i not in indices:
    msg = ("Vocabulary of size %d doesn't contain index "
    "%d." % (len(vocabulary), i))
    raise ValueError(msg)
    if not vocabulary:
    raise ValueError("empty vocabulary passed to fit")
    self.fixed_vocabulary_ = True
    self.vocabulary_ = dict(vocabulary)
    else:
    self.fixed_vocabulary_ = False

    @property
    @deprecated("The `fixed_vocabulary` attribute is deprecated and will be "
    "removed in 0.18. Please use `fixed_vocabulary_` instead.")
    def fixed_vocabulary(self):
    return self.fixed_vocabulary_


    class HashingVectorizer(BaseEstimator, VectorizerMixin):
    """Convert a collection of text documents to a matrix of token occurrences
    It turns a collection of text documents into a scipy.sparse matrix holding
    token occurrence counts (or binary occurrence information), possibly
    normalized as token frequencies if norm='l1' or projected on the euclidean
    unit sphere if norm='l2'.
    This text vectorizer implementation uses the hashing trick to find the
    token string name to feature integer index mapping.
    This strategy has several advantages:
- it is very low memory: it scales to large datasets as there is no need
to store a vocabulary dictionary in memory
    - it is fast to pickle and un-pickle as it holds no state besides the
    constructor parameters
    - it can be used in a streaming (partial fit) or parallel pipeline as there
    is no state computed during fit.
    There are also a couple of cons (vs using a CountVectorizer with an
    in-memory vocabulary):
    - there is no way to compute the inverse transform (from feature indices to
    string feature names) which can be a problem when trying to introspect
    which features are most important to a model.
    - there can be collisions: distinct tokens can be mapped to the same
    feature index. However in practice this is rarely an issue if n_features
    is large enough (e.g. 2 ** 18 for text classification problems).
    - no IDF weighting as this would render the transformer stateful.
    The hash function employed is the signed 32-bit version of Murmurhash3.
    Parameters
    ----------
    input: string {'filename', 'file', 'content'}
    If 'filename', the sequence passed as an argument to fit is
    expected to be a list of filenames that need reading to fetch
    the raw content to analyze.
    If 'file', the sequence items must have a 'read' method (file-like
    object) that is called to fetch the bytes in memory.
Otherwise the input is expected to be a sequence of items of type
string or bytes, which are analyzed directly.
    encoding : string, 'utf-8' by default.
    If bytes or files are given to analyze, this encoding is used to
    decode.
    decode_error : {'strict', 'ignore', 'replace'}
    Instruction on what to do if a byte sequence is given to analyze that
    contains characters not of the given `encoding`. By default, it is
    'strict', meaning that a UnicodeDecodeError will be raised. Other
    values are 'ignore' and 'replace'.
    strip_accents: {'ascii', 'unicode', None}
    Remove accents during the preprocessing step.
    'ascii' is a fast method that only works on characters that have
a direct ASCII mapping.
    'unicode' is a slightly slower method that works on any characters.
    None (default) does nothing.
    analyzer: string, {'word', 'char', 'char_wb'} or callable
    Whether the feature should be made of word or character n-grams.
    Option 'char_wb' creates character n-grams only from text inside
    word boundaries.
    If a callable is passed it is used to extract the sequence of features
    out of the raw, unprocessed input.
    preprocessor: callable or None (default)
    Override the preprocessing (string transformation) stage while
    preserving the tokenizing and n-grams generation steps.
    tokenizer: callable or None (default)
    Override the string tokenization step while preserving the
    preprocessing and n-grams generation steps.
    ngram_range: tuple (min_n, max_n)
    The lower and upper boundary of the range of n-values for different
    n-grams to be extracted. All values of n such that min_n <= n <= max_n
    will be used.
    stop_words: string {'english'}, list, or None (default)
    If 'english', a built-in stop word list for English is used.
    If a list, that list is assumed to contain stop words, all of which
    will be removed from the resulting tokens.
    lowercase: boolean, default True
    Convert all characters to lowercase before tokenizing.
    token_pattern: string
    Regular expression denoting what constitutes a "token", only used
    if `analyzer == 'word'`. The default regexp selects tokens of 2
    or more alphanumeric characters (punctuation is completely ignored
    and always treated as a token separator).
    n_features : integer, optional, (2 ** 20) by default
    The number of features (columns) in the output matrices. Small numbers
    of features are likely to cause hash collisions, but large numbers
    will cause larger coefficient dimensions in linear learners.
    norm : 'l1', 'l2' or None, optional
    Norm used to normalize term vectors. None for no normalization.
    binary: boolean, False by default.
    If True, all non zero counts are set to 1. This is useful for discrete
    probabilistic models that model binary events rather than integer
    counts.
    dtype: type, optional
    Type of the matrix returned by fit_transform() or transform().
    non_negative : boolean, optional
    Whether output matrices should contain non-negative values only;
    effectively calls abs on the matrix prior to returning it.
    When True, output values can be interpreted as frequencies.
    When False, output values will have expected value zero.
    See also
    --------
    CountVectorizer, TfidfVectorizer
    """

    def __init__(self, input='content', encoding='utf-8',
    decode_error='strict', strip_accents=None,
    lowercase=True, preprocessor=None, tokenizer=None,
    stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
    ngram_range=(1, 1), analyzer='word', n_features=(2 ** 20),
    binary=False, norm='l2', non_negative=False,
    dtype=np.float64):
    self.input = input
    self.encoding = encoding
    self.decode_error = decode_error
    self.strip_accents = strip_accents
    self.preprocessor = preprocessor
    self.tokenizer = tokenizer
    self.analyzer = analyzer
    self.lowercase = lowercase
    self.token_pattern = token_pattern
    self.stop_words = stop_words
    self.n_features = n_features
    self.ngram_range = ngram_range
    self.binary = binary
    self.norm = norm
    self.non_negative = non_negative
    self.dtype = dtype

    def partial_fit(self, X, y=None):
    """Does nothing: this transformer is stateless.
    This method is just there to mark the fact that this transformer
    can work in a streaming setup.
    """
    return self

    def fit(self, X, y=None):
    """Does nothing: this transformer is stateless."""
    # triggers a parameter validation
    self._get_hasher().fit(X, y=y)
    return self

    def transform(self, X, y=None):
    """Transform a sequence of documents to a document-term matrix.
    Parameters
    ----------
    X : iterable over raw text documents, length = n_samples
    Samples. Each sample must be a text document (either bytes or
    unicode strings, file name or file object depending on the
    constructor argument) which will be tokenized and hashed.
    y : (ignored)
    Returns
    -------
    X : scipy.sparse matrix, shape = (n_samples, self.n_features)
    Document-term matrix.
    """
    analyzer = self.build_analyzer()
    X = self._get_hasher().transform(analyzer(doc) for doc in X)
    if self.binary:
    X.data.fill(1)
    if self.norm is not None:
    X = normalize(X, norm=self.norm, copy=False)
    return X

    # Alias transform to fit_transform for convenience
    fit_transform = transform

    def _get_hasher(self):
    return FeatureHasher(n_features=self.n_features,
    input_type='string', dtype=self.dtype,
    non_negative=self.non_negative)
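

# --- Illustration (not part of the upstream scikit-learn module) -------------
# Minimal usage sketch for HashingVectorizer above; the helper is hypothetical
# and never called anywhere in this module.
def _example_hashing_vectorizer():
    docs = ["the cat sat on the mat", "the dog sat"]
    # Stateless: no vocabulary is learned, tokens hash straight to columns.
    hv = HashingVectorizer(n_features=2 ** 10, norm='l2')
    return hv.transform(docs)  # sparse matrix of shape (2, 1024)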


def _add_sparse_column(sparse, column):
"""Add column[i, 0] to every non-zero entry in row i of ``sparse``."""
addition = sp.lil_matrix(sparse.shape)
sparse_coo = sparse.tocoo()
# itertools.izip is Python 2 only; the builtin zip keeps this compatible
# with the six-based Python 2/3 support used elsewhere in this module.
for i, j, v in zip(sparse_coo.row, sparse_coo.col, sparse_coo.data):
addition[i, j] = v + column[i, 0]
return addition.tocsr()

    def _class_frequencies(X, y):
    """Count the number of non-zero values for each class y in sparse X."""

    labels = np.unique(y)
    if len(labels) > 2:
    raise ValueError("Delta works only with binary classification problems")

    # Indices for each type of labels in y
    N1 = np.where(y == labels[0])[0]
    N2 = np.where(y == labels[1])[0]

    # Number of positive documents that each term appears on
    df1 = np.bincount(X[N1].nonzero()[1], minlength=X.shape[1])
    # Number of negative documents that each term appears on
    df2 = np.bincount(X[N2].nonzero()[1], minlength=X.shape[1])

    return N1.shape[0], df1, N2.shape[0], df2


    def _document_frequency(X):
    """Count the number of non-zero values for each feature in sparse X."""
    if sp.isspmatrix_csr(X):
    return np.bincount(X.indices, minlength=X.shape[1])
    else:
    return np.diff(sp.csc_matrix(X, copy=False).indptr)
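

# --- Illustration (not part of the upstream scikit-learn module) -------------
# Doc-only sketch of the two frequency helpers above on a toy corpus; the
# helper below is hypothetical and never called.
def _example_frequencies():
    X = sp.csr_matrix(np.array([[1, 0],
                                [2, 1],
                                [0, 3]]))
    y = np.array([1, 1, 0])
    # Each of the two terms occurs (with a non-zero count) in two documents:
    dfs = _document_frequency(X)                  # array([2, 2])
    # Class 0 has one document with df1 = [0, 1]; class 1 has two documents
    # with df2 = [2, 1]:
    n1, df1, n2, df2 = _class_frequencies(X, y)   # (1, [0, 1], 2, [2, 1])
    return dfs, n1, df1, n2, df2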


    class CountVectorizer(BaseEstimator, VectorizerMixin):
    """Convert a collection of text documents to a matrix of token counts
    This implementation produces a sparse representation of the counts using
scipy.sparse.csr_matrix.
    If you do not provide an a-priori dictionary and you do not use an analyzer
    that does some kind of feature selection then the number of features will
    be equal to the vocabulary size found by analyzing the data.
    Parameters
    ----------
    input : string {'filename', 'file', 'content'}
    If 'filename', the sequence passed as an argument to fit is
    expected to be a list of filenames that need reading to fetch
    the raw content to analyze.
    If 'file', the sequence items must have a 'read' method (file-like
    object) that is called to fetch the bytes in memory.
Otherwise the input is expected to be a sequence of items of type
string or bytes, which are analyzed directly.
    encoding : string, 'utf-8' by default.
    If bytes or files are given to analyze, this encoding is used to
    decode.
    decode_error : {'strict', 'ignore', 'replace'}
    Instruction on what to do if a byte sequence is given to analyze that
    contains characters not of the given `encoding`. By default, it is
    'strict', meaning that a UnicodeDecodeError will be raised. Other
    values are 'ignore' and 'replace'.
    strip_accents : {'ascii', 'unicode', None}
    Remove accents during the preprocessing step.
    'ascii' is a fast method that only works on characters that have
a direct ASCII mapping.
    'unicode' is a slightly slower method that works on any characters.
    None (default) does nothing.
    analyzer : string, {'word', 'char', 'char_wb'} or callable
    Whether the feature should be made of word or character n-grams.
    Option 'char_wb' creates character n-grams only from text inside
    word boundaries.
    If a callable is passed it is used to extract the sequence of features
    out of the raw, unprocessed input.
    preprocessor : callable or None (default)
    Override the preprocessing (string transformation) stage while
    preserving the tokenizing and n-grams generation steps.
    tokenizer : callable or None (default)
    Override the string tokenization step while preserving the
    preprocessing and n-grams generation steps.
    ngram_range : tuple (min_n, max_n)
    The lower and upper boundary of the range of n-values for different
    n-grams to be extracted. All values of n such that min_n <= n <= max_n
    will be used.
    stop_words : string {'english'}, list, or None (default)
    If 'english', a built-in stop word list for English is used.
    If a list, that list is assumed to contain stop words, all of which
    will be removed from the resulting tokens.
    If None, no stop words will be used. max_df can be set to a value
    in the range [0.7, 1.0) to automatically detect and filter stop
    words based on intra corpus document frequency of terms.
    lowercase : boolean, True by default
    Convert all characters to lowercase before tokenizing.
    token_pattern : string
    Regular expression denoting what constitutes a "token", only used
if `analyzer == 'word'`. The default regexp selects tokens of 2
    or more alphanumeric characters (punctuation is completely ignored
    and always treated as a token separator).
    max_df : float in range [0.0, 1.0] or int, optional, 1.0 by default
    When building the vocabulary ignore terms that have a document
    frequency strictly higher than the given threshold (corpus-specific
    stop words).
    If float, the parameter represents a proportion of documents, integer
    absolute counts.
    This parameter is ignored if vocabulary is not None.
    min_df : float in range [0.0, 1.0] or int, optional, 1 by default
    When building the vocabulary ignore terms that have a document
    frequency strictly lower than the given threshold. This value is also
    called cut-off in the literature.
    If float, the parameter represents a proportion of documents, integer
    absolute counts.
    This parameter is ignored if vocabulary is not None.
    max_features : optional, None by default
    If not None, build a vocabulary that only consider the top
    max_features ordered by term frequency across the corpus.
    This parameter is ignored if vocabulary is not None.
    vocabulary : Mapping or iterable, optional
    Either a Mapping (e.g., a dict) where keys are terms and values are
    indices in the feature matrix, or an iterable over terms. If not
    given, a vocabulary is determined from the input documents. Indices
    in the mapping should not be repeated and should not have any gap
    between 0 and the largest index.
    binary : boolean, False by default.
    If True, all non zero counts are set to 1. This is useful for discrete
    probabilistic models that model binary events rather than integer
    counts.
    dtype : type, optional
    Type of the matrix returned by fit_transform() or transform().
    Attributes
    ----------
    vocabulary_ : dict
    A mapping of terms to feature indices.
    stop_words_ : set
    Terms that were ignored because
    they occurred in either too many
    (`max_df`) or in too few (`min_df`) documents.
    This is only available if no vocabulary was given.
    See also
    --------
    HashingVectorizer, TfidfVectorizer
    """

    def __init__(self, input='content', encoding='utf-8',
    decode_error='strict', strip_accents=None,
    lowercase=True, preprocessor=None, tokenizer=None,
    stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
    ngram_range=(1, 1), analyzer='word',
    max_df=1.0, min_df=1, max_features=None,
    vocabulary=None, binary=False, dtype=np.int64):
    self.input = input
    self.encoding = encoding
    self.decode_error = decode_error
    self.strip_accents = strip_accents
    self.preprocessor = preprocessor
    self.tokenizer = tokenizer
    self.analyzer = analyzer
    self.lowercase = lowercase
    self.token_pattern = token_pattern
    self.stop_words = stop_words
    self.max_df = max_df
    self.min_df = min_df
    if max_df < 0 or min_df < 0:
    raise ValueError("negative value for max_df of min_df")
    self.max_features = max_features
    if max_features is not None:
    if (not isinstance(max_features, numbers.Integral) or
    max_features <= 0):
    raise ValueError(
    "max_features=%r, neither a positive integer nor None"
    % max_features)
    self.ngram_range = ngram_range
    self.vocabulary = vocabulary
    self.binary = binary
    self.dtype = dtype

    def _sort_features(self, X, vocabulary):
    """Sort features by name
    Returns a reordered matrix and modifies the vocabulary in place
    """
    sorted_features = sorted(six.iteritems(vocabulary))
    map_index = np.empty(len(sorted_features), dtype=np.int32)
    for new_val, (term, old_val) in enumerate(sorted_features):
    map_index[new_val] = old_val
    vocabulary[term] = new_val
    return X[:, map_index]

    def _limit_features(self, X, vocabulary, high=None, low=None,
    limit=None):
    """Remove too rare or too common features.
Prune features that are non-zero in more documents than ``high`` or in
fewer documents than ``low``, modifying the vocabulary, and restricting
it to at most the ``limit`` most frequent features.
    This does not prune samples with zero features.
    """
    if high is None and low is None and limit is None:
    return X, set()

    # Calculate a mask based on document frequencies
    dfs = _document_frequency(X)
    tfs = np.asarray(X.sum(axis=0)).ravel()
    mask = np.ones(len(dfs), dtype=bool)
    if high is not None:
    mask &= dfs <= high
    if low is not None:
    mask &= dfs >= low
    if limit is not None and mask.sum() > limit:
    mask_inds = (-tfs[mask]).argsort()[:limit]
    new_mask = np.zeros(len(dfs), dtype=bool)
    new_mask[np.where(mask)[0][mask_inds]] = True
    mask = new_mask

    new_indices = np.cumsum(mask) - 1 # maps old indices to new
    removed_terms = set()
    for term, old_index in list(six.iteritems(vocabulary)):
    if mask[old_index]:
    vocabulary[term] = new_indices[old_index]
    else:
    del vocabulary[term]
    removed_terms.add(term)
    kept_indices = np.where(mask)[0]
    if len(kept_indices) == 0:
    raise ValueError("After pruning, no terms remain. Try a lower"
    " min_df or a higher max_df.")
    return X[:, kept_indices], removed_terms

    def _count_vocab(self, raw_documents, fixed_vocab):
    """Create sparse feature matrix, and vocabulary where fixed_vocab=False
    """
    if fixed_vocab:
    vocabulary = self.vocabulary_
    else:
    # Add a new value when a new vocabulary item is seen
    vocabulary = defaultdict()
    vocabulary.default_factory = vocabulary.__len__

    analyze = self.build_analyzer()
    j_indices = _make_int_array()
    indptr = _make_int_array()
    indptr.append(0)
    for doc in raw_documents:
    for feature in analyze(doc):
    try:
    j_indices.append(vocabulary[feature])
    except KeyError:
    # Ignore out-of-vocabulary items for fixed_vocab=True
    continue
    indptr.append(len(j_indices))

    if not fixed_vocab:
    # disable defaultdict behaviour
    vocabulary = dict(vocabulary)
    if not vocabulary:
    raise ValueError("empty vocabulary; perhaps the documents only"
    " contain stop words")

    # some Python/Scipy versions won't accept an array.array:
    if j_indices:
    j_indices = np.frombuffer(j_indices, dtype=np.intc)
    else:
    j_indices = np.array([], dtype=np.int32)
    indptr = np.frombuffer(indptr, dtype=np.intc)
    values = np.ones(len(j_indices))

    X = sp.csr_matrix((values, j_indices, indptr),
    shape=(len(indptr) - 1, len(vocabulary)),
    dtype=self.dtype)
    X.sum_duplicates()
    return vocabulary, X

    def fit(self, raw_documents, y=None):
    """Learn a vocabulary dictionary of all tokens in the raw documents.
    Parameters
    ----------
    raw_documents : iterable
    An iterable which yields either str, unicode or file objects.
    Returns
    -------
    self
    """
    self.fit_transform(raw_documents)
    return self

    def fit_transform(self, raw_documents, y=None):
    """Learn the vocabulary dictionary and return term-document matrix.
    This is equivalent to fit followed by transform, but more efficiently
    implemented.
    Parameters
    ----------
    raw_documents : iterable
    An iterable which yields either str, unicode or file objects.
    Returns
    -------
    X : array, [n_samples, n_features]
    Document-term matrix.
    """
    # We intentionally don't call the transform method to make
    # fit_transform overridable without unwanted side effects in
    # TfidfVectorizer.
    self._check_vocabulary()
    max_df = self.max_df
    min_df = self.min_df
    max_features = self.max_features

    vocabulary, X = self._count_vocab(raw_documents,
    self.fixed_vocabulary_)

    if self.binary:
    X.data.fill(1)

    if not self.fixed_vocabulary_:
    X = self._sort_features(X, vocabulary)

    n_doc = X.shape[0]
    max_doc_count = (max_df
    if isinstance(max_df, numbers.Integral)
    else max_df * n_doc)
    min_doc_count = (min_df
    if isinstance(min_df, numbers.Integral)
    else min_df * n_doc)
    if max_doc_count < min_doc_count:
    raise ValueError(
    "max_df corresponds to < documents than min_df")
    X, self.stop_words_ = self._limit_features(X, vocabulary,
    max_doc_count,
    min_doc_count,
    max_features)

    self.vocabulary_ = vocabulary

    return X

    def transform(self, raw_documents):
    """Transform documents to document-term matrix.
    Extract token counts out of raw text documents using the vocabulary
    fitted with fit or the one provided to the constructor.
    Parameters
    ----------
    raw_documents : iterable
    An iterable which yields either str, unicode or file objects.
    Returns
    -------
    X : sparse matrix, [n_samples, n_features]
    Document-term matrix.
    """
    if not hasattr(self, 'vocabulary_'):
    self._check_vocabulary()

    if not hasattr(self, 'vocabulary_') or len(self.vocabulary_) == 0:
    raise ValueError("Vocabulary wasn't fitted or is empty!")

    # use the same matrix-building strategy as fit_transform
    _, X = self._count_vocab(raw_documents, fixed_vocab=True)
    if self.binary:
    X.data.fill(1)
    return X

    def inverse_transform(self, X):
    """Return terms per document with nonzero entries in X.
    Parameters
    ----------
    X : {array, sparse matrix}, shape = [n_samples, n_features]
    Returns
    -------
    X_inv : list of arrays, len = n_samples
    List of arrays of terms.
    """
    if sp.issparse(X):
    # We need CSR format for fast row manipulations.
    X = X.tocsr()
    else:
    # We need to convert X to a matrix, so that the indexing
    # returns 2D objects
    X = np.asmatrix(X)
    n_samples = X.shape[0]

    terms = np.array(list(self.vocabulary_.keys()))
    indices = np.array(list(self.vocabulary_.values()))
    inverse_vocabulary = terms[np.argsort(indices)]

    return [inverse_vocabulary[X[i, :].nonzero()[1]].ravel()
    for i in range(n_samples)]

    def get_feature_names(self):
    """Array mapping from feature integer indices to feature name"""
    if not hasattr(self, 'vocabulary_') or len(self.vocabulary_) == 0:
    raise ValueError("Vocabulary wasn't fitted or is empty!")

    return [t for t, i in sorted(six.iteritems(self.vocabulary_),
    key=itemgetter(1))]
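

# --- Illustration (not part of the upstream scikit-learn module) -------------
# Minimal usage sketch for CountVectorizer above; the helper is hypothetical
# and never called.
def _example_count_vectorizer():
    docs = ["the cat sat on the mat", "the dog sat"]
    vec = CountVectorizer()
    X = vec.fit_transform(docs)
    # vec.get_feature_names() -> ['cat', 'dog', 'mat', 'on', 'sat', 'the']
    # X.toarray()             -> [[1, 0, 1, 1, 1, 2],
    #                             [0, 1, 0, 0, 1, 1]]
    return vec, X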


    def _make_int_array():
    """Construct an array.array of a type suitable for scipy.sparse indices."""
    return array.array(str("i"))


    class TfidfTransformer(BaseEstimator, TransformerMixin):
    """Transform a count matrix to a normalized tf or tf-idf representation
    Tf means term-frequency while tf-idf means term-frequency times inverse
    document-frequency. This is a common term weighting scheme in information
    retrieval, that has also found good use in document classification.
    The goal of using tf-idf instead of the raw frequencies of occurrence of a
    token in a given document is to scale down the impact of tokens that occur
    very frequently in a given corpus and that are hence empirically less
    informative than features that occur in a small fraction of the training
    corpus.
    The actual formula used for tf-idf is tf * (idf + 1) = tf + tf * idf,
    instead of tf * idf. The effect of this is that terms with zero idf, i.e.
    that occur in all documents of a training set, will not be entirely
    ignored. The formulas used to compute tf and idf depend on parameter
    settings that correspond to the SMART notation used in IR, as follows:
    Tf is "n" (natural) by default, "l" (logarithmic) when sublinear_tf=True.
    Idf is "t" when use_idf is given, "n" (none) otherwise.
    Normalization is "c" (cosine) when norm='l2', "n" (none) when norm=None.
    Parameters
    ----------
    norm : 'l1', 'l2' or None, optional
    Norm used to normalize term vectors. None for no normalization.
    use_idf : boolean, optional
    Enable inverse-document-frequency reweighting.
    smooth_idf : boolean, optional
    Smooth idf weights by adding one to document frequencies, as if an
    extra document was seen containing every term in the collection
    exactly once. Prevents zero divisions.
    sublinear_tf : boolean, optional
    Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).
    References
    ----------
    .. [Yates2011] `R. Baeza-Yates and B. Ribeiro-Neto (2011). Modern
    Information Retrieval. Addison Wesley, pp. 68-74.`
    .. [MRS2008] `C.D. Manning, P. Raghavan and H. Schuetze (2008).
    Introduction to Information Retrieval. Cambridge University
    Press, pp. 118-120.`
    """

    def __init__(self, norm='l2', use_idf=True, use_bm25idf=False, smooth_idf=True,
    delta_idf=False, sublinear_tf=False, bm25_tf=False):
    self.norm = norm
    self.use_idf = use_idf
    self.use_bm25idf = use_bm25idf
    self.smooth_idf = smooth_idf
    # Required for delta idf's
    self.delta_idf = delta_idf

    self.sublinear_tf = sublinear_tf
self.bm25_tf = bm25_tf
# BM25 tuning constants: k controls term-frequency saturation, b controls
# document-length normalization.
self.k = 1.2
self.b = 0.95

def fit(self, X, y=None):
"""Learn the idf vector (global term weights)
Parameters
----------
X : sparse matrix, [n_samples, n_features]
a matrix of term/token counts
y : array-like, shape = [n_samples], optional
binary class labels, required when delta_idf=True
"""
    if not sp.issparse(X):
    X = sp.csc_matrix(X)

    if self.use_idf:
    n_samples, n_features = X.shape

    # BM25 idf
    if self.use_bm25idf:
    if self.delta_idf:
    if y is None:
    raise ValueError("Labels are needed to determine Delta idf")

    N1, df1, N2, df2 = _class_frequencies(X, y)
    delta_bm25idf = np.log(((N1 - df1 + 0.5) * df2 + 0.5) / ((N2 - df2 + 0.5) * df1 + 0.5))
    self._idf_diag = sp.spdiags(delta_bm25idf,
    diags=0, m=n_features, n=n_features)
    else:
    # vanilla bm25 idf
    df = _document_frequency(X)

    # perform idf smoothing if required
    df += int(self.smooth_idf)
    n_samples += int(self.smooth_idf)

# Standard BM25 idf; unlike the vanilla branch below there is no +1
# offset, so terms that occur in more than half the documents get a
# negative weight.
bm25idf = np.log((n_samples - df + 0.5) / (df + 0.5))
    self._idf_diag = sp.spdiags(bm25idf,
    diags=0, m=n_features, n=n_features)

    # Vanilla idf
    elif self.delta_idf:
    if y is None:
    raise ValueError("Labels are needed to determine Delta idf")

    N1, df1, N2, df2 = _class_frequencies(X, y)
    delta_idf = np.log((df1 * float(N2) + int(self.smooth_idf)) /
    (df2 * N1 + int(self.smooth_idf)))

    # Maybe scale delta_idf to only positive values (for Naive Bayes, etc) ?
    self._idf_diag = sp.spdiags(delta_idf,
    diags=0, m=n_features, n=n_features)

    else:
    df = _document_frequency(X)

    # perform idf smoothing if required
    df += int(self.smooth_idf)
    n_samples += int(self.smooth_idf)

# Adding 1 to the idf (rather than using the plain log) makes sure
# terms with zero idf, i.e. terms that occur in every document, are
# not entirely suppressed.
idf = np.log(float(n_samples) / df) + 1.0
    self._idf_diag = sp.spdiags(idf,
    diags=0, m=n_features, n=n_features)


    return self
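
# Doc-only summary of the idf variants fitted above, with df(t) the number of
# documents containing term t, N = n_samples, s = int(smooth_idf), and
# df1/N1, df2/N2 the per-class document frequencies and document counts
# returned by _class_frequencies:
#   vanilla idf:    log((N + s) / (df(t) + s)) + 1
#   BM25 idf:       log((N - df(t) + 0.5) / (df(t) + 0.5))
#                   (with s added to both N and df(t) first)
#   delta idf:      log((df1(t) * N2 + s) / (df2(t) * N1 + s))
#   delta BM25 idf: log(((N1 - df1(t) + 0.5) * df2(t) + 0.5) /
#                       ((N2 - df2(t) + 0.5) * df1(t) + 0.5))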

    def transform(self, X, copy=True):
    """Transform a count matrix to a tf or tf-idf representation
    Parameters
    ----------
    X : sparse matrix, [n_samples, n_features]
    a matrix of term/token counts
    Returns
    -------
    vectors : sparse matrix, [n_samples, n_features]
    """
    if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
    # preserve float family dtype
    X = sp.csr_matrix(X, copy=copy)
    else:
    # convert counts or binary occurrences to floats
    X = sp.csr_matrix(X, dtype=np.float64, copy=copy)

    n_samples, n_features = X.shape

if self.bm25_tf:
# BM25 term frequency:
#   tf_bm25 = tf * (k + 1) / (tf + k * (1 - b + b * |d| / avgdl))
# where |d| is the document length (row sum of counts) and avgdl is
# the average document length over the documents being transformed.
D = (X.sum(1) / np.average(X.sum(1))).reshape((n_samples, 1))
D = ((1 - self.b) + self.b * D) * self.k
# Add the per-document denominator term to the non-zero entries.
# TODO: find a better way to add a column vector to a sparse matrix.
D_X = _add_sparse_column(X, D)
np.divide(X.data * (self.k + 1), D_X.data, X.data)

    elif self.sublinear_tf:
    np.log(X.data, X.data)
    X.data += 1

    if self.use_idf:
    if not hasattr(self, "_idf_diag"):
    raise ValueError("idf vector not fitted")
    expected_n_features = self._idf_diag.shape[0]
    if n_features != expected_n_features:
    raise ValueError("Input has n_features=%d while the model"
    " has been trained with n_features=%d" % (
    n_features, expected_n_features))
    # *= doesn't work
    X = X * self._idf_diag

    if self.norm:
    X = normalize(X, norm=self.norm, copy=False)

    return X

    @property
    def idf_(self):
    if hasattr(self, "_idf_diag"):
    return np.ravel(self._idf_diag.sum(axis=0))
    else:
    return None
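

# --- Illustration (not part of the upstream scikit-learn module) -------------
# Minimal usage sketch for the BM25 and delta-idf options added above; the
# helper is hypothetical and never called.
def _example_tfidf_transformer():
    counts = sp.csr_matrix(np.array([[3, 0, 1],
                                     [1, 2, 0],
                                     [0, 1, 1]], dtype=np.float64))
    y = np.array([1, 1, 0])
    classic = TfidfTransformer().fit(counts)                        # tf-idf
    bm25 = TfidfTransformer(use_bm25idf=True, bm25_tf=True).fit(counts)
    delta = TfidfTransformer(delta_idf=True).fit(counts, y)         # needs labels
    return (classic.transform(counts),
            bm25.transform(counts),
            delta.transform(counts))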


    class TfidfVectorizer(CountVectorizer):
    """Convert a collection of raw documents to a matrix of TF-IDF features.
    Equivalent to CountVectorizer followed by TfidfTransformer.
    Parameters
    ----------
    input : string {'filename', 'file', 'content'}
    If 'filename', the sequence passed as an argument to fit is
    expected to be a list of filenames that need reading to fetch
    the raw content to analyze.
    If 'file', the sequence items must have a 'read' method (file-like
    object) that is called to fetch the bytes in memory.
Otherwise the input is expected to be a sequence of items of type
string or bytes, which are analyzed directly.
    encoding : string, 'utf-8' by default.
    If bytes or files are given to analyze, this encoding is used to
    decode.
    decode_error : {'strict', 'ignore', 'replace'}
    Instruction on what to do if a byte sequence is given to analyze that
    contains characters not of the given `encoding`. By default, it is
    'strict', meaning that a UnicodeDecodeError will be raised. Other
    values are 'ignore' and 'replace'.
    strip_accents : {'ascii', 'unicode', None}
    Remove accents during the preprocessing step.
    'ascii' is a fast method that only works on characters that have
a direct ASCII mapping.
    'unicode' is a slightly slower method that works on any characters.
    None (default) does nothing.
    analyzer : string, {'word', 'char'} or callable
    Whether the feature should be made of word or character n-grams.
    If a callable is passed it is used to extract the sequence of features
    out of the raw, unprocessed input.
    preprocessor : callable or None (default)
    Override the preprocessing (string transformation) stage while
    preserving the tokenizing and n-grams generation steps.
    tokenizer : callable or None (default)
    Override the string tokenization step while preserving the
    preprocessing and n-grams generation steps.
    ngram_range : tuple (min_n, max_n)
    The lower and upper boundary of the range of n-values for different
    n-grams to be extracted. All values of n such that min_n <= n <= max_n
    will be used.
    stop_words : string {'english'}, list, or None (default)
    If a string, it is passed to _check_stop_list and the appropriate stop
    list is returned. 'english' is currently the only supported string
    value.
    If a list, that list is assumed to contain stop words, all of which
    will be removed from the resulting tokens.
    If None, no stop words will be used. max_df can be set to a value
    in the range [0.7, 1.0) to automatically detect and filter stop
    words based on intra corpus document frequency of terms.
    lowercase : boolean, default True
    Convert all characters to lowercase before tokenizing.
    token_pattern : string
    Regular expression denoting what constitutes a "token", only used
    if `analyzer == 'word'`. The default regexp selects tokens of 2
    or more alphanumeric characters (punctuation is completely ignored
    and always treated as a token separator).
    max_df : float in range [0.0, 1.0] or int, optional, 1.0 by default
When building the vocabulary ignore terms that have a document
frequency strictly higher than the given threshold (corpus-specific
stop words).
    If float, the parameter represents a proportion of documents, integer
    absolute counts.
    This parameter is ignored if vocabulary is not None.
    min_df : float in range [0.0, 1.0] or int, optional, 1 by default
When building the vocabulary ignore terms that have a document
frequency strictly lower than the given threshold.
    This value is also called cut-off in the literature.
    If float, the parameter represents a proportion of documents, integer
    absolute counts.
    This parameter is ignored if vocabulary is not None.
    max_features : optional, None by default
    If not None, build a vocabulary that only consider the top
    max_features ordered by term frequency across the corpus.
    This parameter is ignored if vocabulary is not None.
    vocabulary : Mapping or iterable, optional
    Either a Mapping (e.g., a dict) where keys are terms and values are
    indices in the feature matrix, or an iterable over terms. If not
    given, a vocabulary is determined from the input documents.
    binary : boolean, False by default.
    If True, all non-zero term counts are set to 1. This does not mean
    outputs will have only 0/1 values, only that the tf term in tf-idf
    is binary. (Set idf and normalization to False to get 0/1 outputs.)
    dtype : type, optional
    Type of the matrix returned by fit_transform() or transform().
    norm : 'l1', 'l2' or None, optional
    Norm used to normalize term vectors. None for no normalization.
    use_idf : boolean, optional
    Enable inverse-document-frequency reweighting.
    smooth_idf : boolean, optional
    Smooth idf weights by adding one to document frequencies, as if an
    extra document was seen containing every term in the collection
    exactly once. Prevents zero divisions.
    sublinear_tf : boolean, optional
    Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).
    Attributes
    ----------
    idf_ : array, shape = [n_features], or None
    The learned idf vector (global term weights)
    when ``use_idf`` is set to True, None otherwise.
    See also
    --------
    CountVectorizer
Tokenize the documents and count the occurrences of tokens and return
them as a sparse matrix
    TfidfTransformer
    Apply Term Frequency Inverse Document Frequency normalization to a
    sparse matrix of occurrence counts.
    """

    def __init__(self, input='content', encoding='utf-8',
    decode_error='strict', strip_accents=None, lowercase=True,
    preprocessor=None, tokenizer=None, analyzer='word',
    stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
    ngram_range=(1, 1), max_df=1.0, min_df=1,
    max_features=None, vocabulary=None, binary=False,
dtype=np.int64, norm='l2', use_idf=True, use_bm25idf=False,
    smooth_idf=True, delta_idf=False, sublinear_tf=False, bm25_tf=False):
    super(TfidfVectorizer, self).__init__(
    input=input, encoding=encoding, decode_error=decode_error,
    strip_accents=strip_accents, lowercase=lowercase,
    preprocessor=preprocessor, tokenizer=tokenizer, analyzer=analyzer,
    stop_words=stop_words, token_pattern=token_pattern,
    ngram_range=ngram_range, max_df=max_df, min_df=min_df,
    max_features=max_features, vocabulary=vocabulary, binary=binary,
    dtype=dtype)

    self._tfidf = TfidfTransformer(norm=norm, use_idf=use_idf,
    use_bm25idf=use_bm25idf,
    smooth_idf=smooth_idf,
    delta_idf=delta_idf,
    sublinear_tf=sublinear_tf,
    bm25_tf=bm25_tf)

    # Broadcast the TF-IDF parameters to the underlying transformer instance
    # for easy grid search and repr

    @property
    def norm(self):
    return self._tfidf.norm

    @norm.setter
    def norm(self, value):
    self._tfidf.norm = value

    @property
    def use_idf(self):
    return self._tfidf.use_idf

    @use_idf.setter
    def use_idf(self, value):
    self._tfidf.use_idf = value

    @property
    def smooth_idf(self):
    return self._tfidf.smooth_idf

    @smooth_idf.setter
    def smooth_idf(self, value):
    self._tfidf.smooth_idf = value

    @property
    def sublinear_tf(self):
    return self._tfidf.sublinear_tf

    @sublinear_tf.setter
    def sublinear_tf(self, value):
    self._tfidf.sublinear_tf = value

    @property
    def idf_(self):
    return self._tfidf.idf_

    def fit(self, raw_documents, y=None):
    """Learn vocabulary and idf from training set.
    Parameters
    ----------
    raw_documents : iterable
    an iterable which yields either str, unicode or file objects
    Returns
    -------
    self : TfidfVectorizer
    """
    X = super(TfidfVectorizer, self).fit_transform(raw_documents)
    self._tfidf.fit(X)
    return self

    def fit_transform(self, raw_documents, y=None):
    """Learn vocabulary and idf, return term-document matrix.
    This is equivalent to fit followed by transform, but more efficiently
    implemented.
    Parameters
    ----------
    raw_documents : iterable
    an iterable which yields either str, unicode or file objects
    Returns
    -------
    X : sparse matrix, [n_samples, n_features]
    Tf-idf-weighted document-term matrix.
    """
    X = super(TfidfVectorizer, self).fit_transform(raw_documents)
self._tfidf.fit(X, y)
    # X is already a transformed view of raw_documents so
    # we set copy to False
    return self._tfidf.transform(X, copy=False)

    def transform(self, raw_documents, copy=True):
    """Transform documents to document-term matrix.
    Uses the vocabulary and document frequencies (df) learned by fit (or
    fit_transform).
    Parameters
    ----------
    raw_documents : iterable
    an iterable which yields either str, unicode or file objects
    Returns
    -------
    X : sparse matrix, [n_samples, n_features]
    Tf-idf-weighted document-term matrix.
    """
    X = super(TfidfVectorizer, self).transform(raw_documents)
    return self._tfidf.transform(X, copy=False)
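

# --- Illustration (not part of the upstream scikit-learn module) -------------
# End-to-end sketch tying the vectorizer to the BM25 weighting added above;
# the helper is hypothetical and never called.
def _example_tfidf_vectorizer():
    docs = ["the cat sat on the mat", "the dog sat", "cats and dogs bark"]
    vec = TfidfVectorizer(use_bm25idf=True, bm25_tf=True)
    X = vec.fit_transform(docs)   # BM25-weighted document-term matrix
    return vec, X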