# -*- coding: utf-8 -*-
"""
    sphinx.search
    ~~~~~~~~~~~~~

    Create a search index for offline search.

    :copyright: 2007 by Armin Ronacher.
    :license: Python license.
"""
import re
import pickle
from collections import defaultdict

from docutils.nodes import Text, NodeVisitor

from .stemmer import PorterStemmer
from .json import dump_json


# Matches runs of word characters.  The UNICODE flag is passed as an
# argument rather than as a trailing inline ``(?u)``: global inline flags
# that are not at the start of the pattern are rejected by newer versions
# of the ``re`` module.
word_re = re.compile(r'\w+', re.UNICODE)


class Stemmer(PorterStemmer):
    """
    All those porter stemmer implementations look hideous.
    make at least the stem method nicer.
    """

    def stem(self, word):
        """Return the stem of `word`.

        Delegates to `PorterStemmer.stem`, passing the index of the first
        and the last character of `word` so callers only hand in the word.
        """
        return PorterStemmer.stem(self, word, 0, len(word) - 1)


class WordCollector(NodeVisitor):
    """
    A special visitor that collects words for the `IndexBuilder`.
    """

    def __init__(self, document):
        NodeVisitor.__init__(self, document)
        # Every word found in the visited text nodes, in visiting order
        # (duplicates are kept).
        self.found_words = []

    def dispatch_visit(self, node):
        """Collect the words of `node` if it is exactly a `Text` node."""
        # Identity check on the class: instances of `Text` subclasses and
        # all other node types are ignored.
        if node.__class__ is Text:
            self.found_words.extend(word_re.findall(node.astext()))


class IndexBuilder(object):
    """
    Helper class that creates a searchindex based on the doctrees
    passed to the `feed` method.
    """

    # Maps a format name accepted by `dump` to the function that
    # serializes the frozen index.
    formats = {
        'json':     dump_json,
        'pickle':   pickle.dumps
    }

    def __init__(self):
        # filename -> file id (ids are assigned consecutively from 0)
        self._filenames = {}
        # stemmed, lowercased word -> set of file ids containing it
        self._mapping = {}
        # file id -> title
        self._titles = {}
        # category -> set of file ids in that category
        self._categories = {}
        self._stemmer = Stemmer()

    def dump(self, stream, format):
        """Dump the freezed index to a stream.

        `format` must be a key of `formats`; an unknown name raises
        `KeyError`.  The serialized index is written with a single
        ``stream.write`` call.
        """
        stream.write(self.formats[format](self.freeze()))

    def freeze(self):
        """
        Create a useable data structure. You can pass this output
        to the `SearchFrontend` to search the index.

        The result is a list of four items:

        1. the filenames, ordered by file id,
        2. a dict mapping each category to its set of file ids,
        3. the titles, ordered by file id,
        4. a dict mapping each stemmed word to its set of file ids.
        """
        return [
            [k for k, v in sorted(self._filenames.items(),
                                  key=lambda x: x[1])],
            dict(item for item in sorted(self._categories.items(),
                                         key=lambda x: x[0])),
            [v for k, v in sorted(self._titles.items(),
                                  key=lambda x: x[0])],
            dict(item for item in sorted(self._mapping.items(),
                                         key=lambda x: x[0])),
        ]

    def feed(self, filename, category, title, doctree):
        """Feed a doctree to the index.

        Registers `filename` (reusing its id if it was fed before), stores
        `title` for it, adds the file to `category` and indexes every word
        of the title and of the text nodes in `doctree`.
        """
        # setdefault keeps the existing id for a known filename; a new
        # filename gets the next free id.
        file_id = self._filenames.setdefault(filename, len(self._filenames))
        self._titles[file_id] = title
        visitor = WordCollector(doctree)
        doctree.walk(visitor)
        self._categories.setdefault(category, set()).add(file_id)
        for word in word_re.findall(title) + visitor.found_words:
            # Words are indexed lowercased and stemmed; `search` applies
            # the same normalization to the query terms.
            self._mapping.setdefault(self._stemmer.stem(word.lower()),
                                     set()).add(file_id)


class SearchFrontend(object):
    """
    This class acts as a frontend for the search index. It can search
    a searchindex as provided by `IndexBuilder`.
    """

    def __init__(self, index):
        # `index` is unpacked into exactly four items, in the order
        # produced by `IndexBuilder.freeze`.
        self.filenames, self.areas, self.titles, self.words = index
        self._stemmer = Stemmer()

    def query(self, required, excluded, areas):
        """Return the ``(filename, title)`` pairs matching the query.

        A file matches if it contains every word in `required`, none of
        the words in `excluded`, and belongs to at least one of `areas`.
        The words are looked up as given, so they must already be stemmed.
        The result is a list sorted case-insensitively by title.
        """
        file_map = defaultdict(set)
        for word in required:
            if word not in self.words:
                # A required word that is not indexed at all means that
                # no file can satisfy the query.
                return []
            for fid in self.words[word]:
                file_map[fid].add(word)

        # `items()` instead of the Python 2 only `iteritems()` so that
        # the lookup works on Python 3 as well.
        return sorted(((self.filenames[fid], self.titles[fid])
            for fid, words in file_map.items()
            if len(words) == len(required) and
               any(fid in self.areas.get(area, ()) for area in areas) and not
               any(fid in self.words.get(word, ()) for word in excluded)
        ), key=lambda x: x[1].lower())

    def search(self, searchstring, areas):
        """Search the index for `searchstring` within `areas`.

        The string is split on whitespace.  Terms starting with ``-`` are
        excluded from the results, all other terms are required.  Every
        term is lowercased and stemmed before it is passed to `query`.
        """
        required = set()
        excluded = set()
        for word in searchstring.split():
            if word.startswith('-'):
                storage = excluded
                word = word[1:]
            else:
                storage = required
            if not word:
                # A bare "-" carries no search term; skip it instead of
                # handing an empty string to the stemmer.
                continue
            storage.add(self._stemmer.stem(word.lower()))

        return self.query(required, excluded, areas)