This index implementation uses Xapian for searching and storage. It tends to be slightly faster than Ferret for indexing and significantly faster for searching due to precomputing thread membership.
Unstemmed
Xapian can very efficiently sort in ascending docid order. Sup always wants to sort by descending date, so this method maps between them. In order to handle multiple messages per second, we use a logistic curve centered around MIDDLE_DATE so that the slope (docid/s) is greatest in this time period. A docid collision is not an error - the code will pick the next smallest unused one.
Dates are converted to integers for Xapian and are used for document ids, so we must ensure they are reasonably valid. This typically only affects spam.
Stemmed
# File lib/sup/xapian_index.rb, line 28
#
# Builds the index object. The arguments are forwarded unchanged to the
# superclass initializer, after which a fresh Monitor is stored in
# @index_mutex.
#
# dir:: index directory; defaults to BASE_DIR.
def initialize(dir = BASE_DIR)
  super
  @index_mutex = Monitor.new
end
# File lib/sup/xapian_index.rb, line 96
#
# Adds message +m+ to the index by delegating to sync_message with its
# second argument set to true.
def add_message(m)
  sync_message(m, true)
end
# File lib/sup/xapian_index.rb, line 76
#
# Reconstructs a Message from the index entry stored under +id+.
#
# Returns nil when get_entry yields nothing for +id+; otherwise returns the
# Message after load_from_index! has been called on it with the entry.
# Raises a RuntimeError ("invalid source ...") when the entry's :source_id
# is not known to SourceManager.
def build_message(id)
  entry = synchronize { get_entry(id) }
  return unless entry

  source = SourceManager[entry[:source_id]]
  raise "invalid source #{entry[:source_id]}" unless source

  message = Message.new(
    :source => source,
    :source_info => entry[:source_info],
    :labels => entry[:labels],
    :snippet => entry[:snippet]
  )

  # Each stored participant is reversed in place and splatted into
  # Person.new, so the entry's arrays are mutated here.
  to_person = lambda { |stored| Person.new(*stored.reverse!) }
  entry[:from] = to_person.call(entry[:from])
  entry[:to].map!(&to_person)
  entry[:cc].map!(&to_person)
  entry[:bcc].map!(&to_person)

  message.load_from_index!(entry)
  message
end
# File lib/sup/xapian_index.rb, line 64
#
# Reports whether a document exists for message id +id+. Runs under
# synchronize. Returns true when find_docid finds something, otherwise
# whatever falsy value find_docid produced.
def contains_id?(id)
  synchronize do
    find_docid(id) && true
  end
end
# File lib/sup/xapian_index.rb, line 72
#
# Removes the document identified by the :msgid term for +id+ from the
# Xapian database. Runs under synchronize.
def delete(id)
  synchronize do
    msgid_term = mkterm(:msgid, id)
    @xapian.delete_document(msgid_term)
  end
end
# File lib/sup/xapian_index.rb, line 107
#
# Yields every message id matching +query+, fetching results in pages of
# EACH_ID_PAGE ids. Iteration stops after the first page that comes back
# shorter than a full page. Returns nil.
def each_id(query = {})
  xapian_query = build_xapian_query(query)
  page_size = EACH_ID_PAGE
  start = 0
  ids = nil

  # The loop body always runs at least once because ids starts out nil.
  until ids && ids.size < page_size
    ids = run_query_ids(xapian_query, start, start + page_size)
    ids.each { |id| yield id }
    start += page_size
  end
end
# File lib/sup/xapian_index.rb, line 120
#
# Iterates over the ids matching +query+ via each_id, yielding each id
# together with a lambda that builds the corresponding message on demand.
def each_id_by_date(query = {})
  each_id(query) do |id|
    builder = lambda { build_message(id) }
    yield id, builder
  end
end
# File lib/sup/xapian_index.rb, line 124
#
# Walks every thread reachable from message +m+ and yields, for each member
# message, its id and a lambda that builds the message on demand.
#
# Returns nil when no document is found for +m+, false when
# opts[:skip_killed] is set and thread_killed? is true for a thread that is
# reached (nothing is yielded in that case), and true otherwise.
def each_message_in_thread_for(m, opts = {})
  # TODO thread by subject
  root = find_doc(m.id)
  return unless root

  pending = root.value(THREAD_VALUENO).split(',')
  ordered_ids = [m.id]
  visited_threads = Set.new
  visited_messages = Set.new [m.id]

  until pending.empty?
    thread_id = pending.pop
    next if visited_threads.include?(thread_id)
    return false if opts[:skip_killed] && thread_killed?(thread_id)
    visited_threads << thread_id

    thread_term = mkterm(:thread, thread_id)
    members = term_docids(thread_term).map { |docid| @xapian.document docid }
    members.each do |member|
      msgid = member.value MSGID_VALUENO
      next if visited_messages.include?(msgid)
      ordered_ids << msgid
      visited_messages << msgid
      # A member's thread ids are queued too, so linked threads get visited.
      pending.concat member.value(THREAD_VALUENO).split(',')
    end
  end

  ordered_ids.each { |id| yield id, lambda { build_message id } }
  true
end
# File lib/sup/xapian_index.rb, line 149
#
# Collects up to opts[:num] (default 20) distinct [name, email] pairs from
# the from/to/cc/bcc fields of messages matching :participants => +emails+,
# and returns them as an array of Person objects.
def load_contacts(emails, opts = {})
  limit = opts[:num] || 20
  pairs = Set.new

  each_id_by_date(:participants => emails) do |_id, builder|
    break if pairs.size >= limit
    message = builder.call
    people = [message.from] + message.to + message.cc + message.bcc
    people.compact.each { |person| pairs << [person.name, person.email] }
  end

  # The last message processed may push the set past the limit, hence the slice.
  pairs.to_a.compact.map { |name, email| Person.new name, email }[0...limit]
end
# File lib/sup/xapian_index.rb, line 34
#
# Opens the Xapian database at <@dir>/xapian, creating it when the path does
# not exist yet, and prepares the Enquire object used for queries.
#
# An existing database must carry a 'version' metadata value equal to
# INDEX_VERSION (an empty value is treated as '0'); otherwise a RuntimeError
# is raised asking the user to downgrade and dump labels first. A newly
# created database is stamped with INDEX_VERSION.
def load_index
  path = File.join(@dir, 'xapian')
  # File.exist? rather than File.exists?: the latter is deprecated and was
  # removed in Ruby 3.2.
  if File.exist?(path)
    @xapian = Xapian::WritableDatabase.new(path, Xapian::DB_OPEN)
    db_version = @xapian.get_metadata 'version'
    db_version = '0' if db_version.empty?
    if db_version != INDEX_VERSION
      fail "This Sup version expects a v#{INDEX_VERSION} index, but you have an existing v#{db_version} index. Please downgrade to your previous version and dump your labels before upgrading to this version (then run sup-sync --restore)."
    end
  else
    @xapian = Xapian::WritableDatabase.new(path, Xapian::DB_CREATE)
    @xapian.set_metadata 'version', INDEX_VERSION
  end
  @enquire = Xapian::Enquire.new @xapian
  # Pure boolean matching, with results ordered by ascending docid.
  @enquire.weighting_scheme = Xapian::BoolWeight.new
  @enquire.docid_order = Xapian::Enquire::ASCENDING
end
# File lib/sup/xapian_index.rb, line 100
#
# Returns Xapian's estimate of the number of matches for +query+. The query
# is run through run_query with the arguments 0, 0, 100 and the resulting
# match set's matches_estimated is returned.
def num_results_for(query = {})
  matchset = run_query(build_xapian_query(query), 0, 0, 100)
  matchset.matches_estimated
end
# File lib/sup/xapian_index.rb, line 57
#
# Intentionally empty: this implementation performs no work on optimize and
# returns nil.
def optimize; end
TODO share code with the Ferret index
# File lib/sup/xapian_index.rb, line 161
#
# Translates the user-supplied search string +s+ into a query hash.
#
# The string is first offered to the "custom-search" hook, then rewritten
# step by step (to:/from:, is:/has:, filename:/filetype:, date operators when
# Chronic is available, limit:) before being handed to Xapian's QueryParser.
#
# Returns a Hash with:
#   :qobj         - the parsed Xapian query
#   :text         - the original string +s+
#   :limit        - Integer, only when a limit:N term was present
#   :load_spam    - true, only when the query mentions spam
#   :load_deleted - true, only when the query mentions deleted
#
# Raises ParseError for an unparseable date, a non-numeric limit, or when
# the resulting Xapian query is nil or empty.
def parse_query(s)
  query = {}

  subs = HookManager.run("custom-search", :subs => s) || s

  ## expand to:/from: into email/name field searches
  subs = subs.gsub(/\b(to|from):(\S+)\b/) do
    field, value = $1, $2
    email_field, name_field = %w(email name).map { |x| "#{field}_#{x}" }
    if (p = ContactManager.contact_for(value))
      "#{email_field}:#{p.email}"
    elsif value == "me"
      '(' + AccountManager.user_emails.map { |e| "#{email_field}:#{e}" }.join(' OR ') + ')'
    else
      "(#{email_field}:#{value} OR #{name_field}:#{value})"
    end
  end

  ## if we see a label:deleted or a label:spam term anywhere in the query
  ## string, we set the extra load_spam or load_deleted options to true.
  ## bizarre? well, because the query allows arbitrary parenthesized boolean
  ## expressions, without fully parsing the query, we can't tell whether
  ## the user is explicitly directing us to search spam messages or not.
  ## e.g. if the string is -(-(-(-(-label:spam)))), does the user want to
  ## search spam messages or not?
  ##
  ## so, we rely on the fact that turning these extra options ON turns OFF
  ## the adding of "-label:deleted" or "-label:spam" terms at the very
  ## final stage of query processing. if the user wants to search spam
  ## messages, not adding that is the right thing; if he doesn't want to
  ## search spam messages, then not adding it won't have any effect.
  query[:load_spam] = true if subs =~ /\blabel:spam\b/
  query[:load_deleted] = true if subs =~ /\blabel:deleted\b/

  ## gmail style "is" operator
  subs = subs.gsub(/\b(is|has):(\S+)\b/) do
    label = $2
    case label
    when "read"
      "-label:unread"
    when "spam"
      query[:load_spam] = true
      "label:spam"
    when "deleted"
      query[:load_deleted] = true
      "label:deleted"
    else
      "label:#{label}"
    end
  end

  ## gmail style attachments "filename" and "filetype" searches
  subs = subs.gsub(/\b(filename|filetype):(\((.+?)\)\B|(\S+)\b)/) do
    field, name = $1, ($3 || $4)
    case field
    when "filename"
      debug "filename: translated #{field}:#{name} to attachment:\"#{name.downcase}\""
      "attachment:\"#{name.downcase}\""
    when "filetype"
      debug "filetype: translated #{field}:#{name} to attachment_extension:#{name.downcase}"
      "attachment_extension:#{name.downcase}"
    end
  end

  if $have_chronic
    ## upper bound for open-ended date ranges. this used to be written
    ## "2<<32 - 1", which ruby parses as 2 << (32 - 1); the value is kept
    ## and the precedence made explicit.
    lastdate = 2 << 31
    firstdate = 0
    subs = subs.gsub(/\b(before|on|in|during|after):(\((.+?)\)\B|(\S+)\b)/) do
      field, datestr = $1, ($3 || $4)
      realdate = Chronic.parse datestr, :guess => false, :context => :past
      if realdate
        case field
        when "after"
          debug "chronic: translated #{field}:#{datestr} to #{realdate.end}"
          "date:#{realdate.end.to_i}..#{lastdate}"
        when "before"
          ## "before X" must stop where X starts. the range previously ended
          ## at realdate.end, which included X itself and contradicted the
          ## debug message.
          debug "chronic: translated #{field}:#{datestr} to #{realdate.begin}"
          "date:#{firstdate}..#{realdate.begin.to_i}"
        else
          debug "chronic: translated #{field}:#{datestr} to #{realdate}"
          "date:#{realdate.begin.to_i}..#{realdate.end.to_i}"
        end
      else
        raise ParseError, "can't understand date #{datestr.inspect}"
      end
    end
  end

  ## limit:42 restrict the search to 42 results
  subs = subs.gsub(/\blimit:(\S+)\b/) do
    lim = $1
    if lim =~ /^\d+$/
      query[:limit] = lim.to_i
      ''
    else
      raise ParseError, "non-numeric limit #{lim.inspect}"
    end
  end

  debug "translated query: #{subs.inspect}"

  qp = Xapian::QueryParser.new
  qp.database = @xapian
  qp.stemmer = Xapian::Stem.new(STEM_LANGUAGE)
  qp.stemming_strategy = Xapian::QueryParser::STEM_SOME
  qp.default_op = Xapian::Query::OP_AND
  qp.add_valuerangeprocessor(Xapian::NumberValueRangeProcessor.new(DATE_VALUENO, 'date:', true))
  NORMAL_PREFIX.each { |k,v| qp.add_prefix k, v }
  BOOLEAN_PREFIX.each { |k,v| qp.add_boolean_prefix k, v }
  xapian_query = qp.parse_query(subs, Xapian::QueryParser::FLAG_PHRASE|Xapian::QueryParser::FLAG_BOOLEAN|Xapian::QueryParser::FLAG_LOVEHATE|Xapian::QueryParser::FLAG_WILDCARD, PREFIX['body'])

  debug "parsed xapian query: #{xapian_query.description}"

  raise ParseError if xapian_query.nil? || xapian_query.empty?
  query[:qobj] = xapian_query
  query[:text] = s
  query
end
# File lib/sup/xapian_index.rb, line 52
#
# Logs an informational notice and then flushes the Xapian database.
def save_index
  info("Flushing Xapian updates to disk. This may take a while...")
  @xapian.flush
end
# File lib/sup/xapian_index.rb, line 60
#
# Returns the Xapian database's document count, read under synchronize.
def size
  synchronize do
    @xapian.doccount
  end
end
# File lib/sup/xapian_index.rb, line 68
#
# Returns the :source_id recorded in the index entry for message id +id+.
# The lookup runs under synchronize.
# NOTE(review): no guard for a missing entry here; if get_entry can return
# nil for an unknown id this raises NoMethodError - confirm callers only
# pass indexed ids.
def source_for_id(id)
  synchronize do
    entry = get_entry(id)
    entry[:source_id]
  end
end
# File lib/sup/xapian_index.rb, line 97
#
# Updates message +m+ in the index by delegating to sync_message with its
# second argument set to true.
def update_message(m)
  sync_message(m, true)
end
# File lib/sup/xapian_index.rb, line 98
#
# Updates the indexed state of message +m+ by delegating to sync_message
# with its second argument set to false.
def update_message_state(m)
  sync_message(m, false)
end