# -*- coding: utf-8 -*-
"""Full-text indexing of project items in Redis, keyed by double-metaphone codes."""
import re

# http://atomboy.isa-geek.com/plone/Members/acoil/programing/double-metaphone
from metaphone import dm as double_metaphone

# get the Redis connection
from jellybean.core import redis
import models

# Words which should not be indexed
STOP_WORDS = ("the", "of", "to", "and", "a", "in", "is", "it", "you", "that")

# Do not index any words shorter than this
MIN_WORD_LENGTH = 3

# Consider these characters to be punctuation (they will be replaced with spaces prior to word extraction)
PUNCTUATION_CHARS = ".,;:!?@£$%^&*()-–<>[]{}\\|/`~'\""

# A redis key holding the set of metaphones present in this project
REDIS_KEY_METAPHONES = "project_id:%(project_id)d:fulltext_search:metaphones"

# A redis key holding the set of item IDs which have the given metaphone within the given project
REDIS_KEY_METAPHONE = "project_id:%(project_id)d:fulltext_search:metaphone:%(metaphone)s"


class FullTextIndex(object):
    """A class to provide full-text indexing functionality using Redis"""

    def __init__(self):
        # Compile once: a character class matching any single punctuation character
        self.punctuation_regex = re.compile(r"[%s]" % re.escape(PUNCTUATION_CHARS))
        super(FullTextIndex, self).__init__()

    def get_words_from_text(self, text):
        """Extract a list of words to index from the given text.

        Punctuation characters are replaced with spaces, the text is split on
        whitespace, and words that are shorter than MIN_WORD_LENGTH or that
        appear (case-insensitively) in STOP_WORDS are dropped.

        Returns an empty list when `text` is empty or None. The original
        casing and order of the remaining words are preserved.
        """
        if not text:
            return []

        text = self.punctuation_regex.sub(" ", text)
        return [
            word
            for word in text.split()
            if len(word) >= MIN_WORD_LENGTH and word.lower() not in STOP_WORDS
        ]

    def index_item(self, item):
        """Extract content from the given item and add it to the index.

        The subject, body, milestone name, type name and tags of the item are
        indexed. Existing index entries for the item are not removed.
        """
        # TODO: Add item users to the index
        # NOTE(review): item.milestone is dereferenced unconditionally --
        # confirm that every item is guaranteed to have a milestone.
        words = self.get_words_from_text(item.subject)
        words += self.get_words_from_text(item.body)
        words += self.get_words_from_text(item.milestone.name)
        words += self.get_words_from_text(item.type_name)
        words += self.get_words_from_text(" ".join(item.tags))

        self._index_words(item, words)

    def index_item_content(self, item, content):
        """Index a specific bit of item content"""
        self._index_words(item, self.get_words_from_text(content))

    def _index_words(self, item, words):
        """Link the item to the metaphone of every word in `words`."""
        for metaphone in self.get_metaphones(words):
            self._link_item_and_metaphone(item, metaphone)

    def _link_item_and_metaphone(self, item, metaphone):
        """Record in Redis that `item` contains `metaphone`.

        Adds the item's ID to the per-metaphone set of its project, and adds
        the metaphone to the project's set of known metaphones.
        """
        # Add the item to the metaphone key
        redis_key = REDIS_KEY_METAPHONE % {"project_id": item.project_id, "metaphone": metaphone}
        redis.sadd(redis_key, item.item_id)

        # Make sure we record that this project contains this metaphone
        redis_key = REDIS_KEY_METAPHONES % {"project_id": item.project_id}
        redis.sadd(redis_key, metaphone)

    def get_metaphones(self, words):
        """Get the metaphones for a given list of words.

        Returns a set containing the stripped primary and (when present)
        secondary double-metaphone code of each word. Empty codes are left
        out: they carry no phonetic information and would otherwise link
        unrelated items together under one blank index key.
        """
        metaphones = set()

        for word in words:
            # `unicode` is the Python 2 text type; this module targets Python 2
            codes = double_metaphone(unicode(word))
            for code in (codes[0], codes[1]):
                if not code:
                    continue
                code = code.strip()
                if code:
                    metaphones.add(code)

        return metaphones

    def reindex_project(self, project_id):
        """Reindex an entire project, removing the existing index for the project.

        Always returns True.
        """
        # Remove all the existing index data
        registry_key = REDIS_KEY_METAPHONES % {"project_id": project_id}
        project_metaphones = redis.smembers(registry_key) or []

        # Delete the per-metaphone keys first and the registry last, so that an
        # interruption part-way through still leaves the remaining keys listed
        # in the registry for the next reindex to clean up.
        for project_metaphone in project_metaphones:
            redis.delete(REDIS_KEY_METAPHONE % {"project_id": project_id, "metaphone": project_metaphone})
        redis.delete(registry_key)

        # Now index each item
        project = models.Project(project_id)
        for item in project.items:
            self.index_item(item)

        return True