1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """
24 Interface to the Xapian indexing engine for the Translate Toolkit
25
26 Xapian v1.0 or higher is supported.
27
28 If you are interested in writing an interface for Xapian 0.x, then
29 you should checkout the following::
30 svn export -r 7235 https://translate.svn.sourceforge.net/svnroot/translate/src/branches/translate-search-indexer-generic-merging/translate/search/indexer/
31 It is not completely working, but it should give you a good start.
32 """
33
34 __revision__ = "$Id: XapianIndexer.py 17248 2011-01-31 16:35:12Z friedelwolff $"
35
36
37 import sys
38 import re
39
40
def _str2version(version):
    """convert a dotted version string to a list of integers

    The resulting lists can be compared with each other.

    @param version: a version string, e.g. "1.0.13"
    @type version: str
    @return: the numeric components of the version, e.g. [1, 0, 13]
    @rtype: list of int
    """
    return [int(i) for i in version.split('.')]


if 'apache' in sys.modules or '_apache' in sys.modules or 'mod_wsgi' in sys.modules:
    # One of the apache modules is loaded: refuse to import xapian unless
    # "xapian-check --version" reports at least version 1.0.13.
    import subprocess

    try:
        command = subprocess.Popen(['xapian-check', '--version'],
                                   stdout=subprocess.PIPE,
                                   universal_newlines=True)
        stdout, stderr = command.communicate()
        # search (instead of a greedy ".*" prefix) keeps all digits of the
        # first version component
        version_match = re.search(r'([0-9]+\.[0-9]+\.[0-9]+)', stdout)
        too_old = _str2version(version_match.group(1)) < [1, 0, 13]
    except Exception:
        # xapian-check is missing, failed to run or printed no version number
        raise ImportError("Running under apache, can't load xapian")
    if too_old:
        raise ImportError("Running under apache, can't load xapian")
56
57 import CommonIndexer
58 import xapian
59 import os
60 import time
61 import logging
62
63
65 return xapian.major_version() > 0
66
67
68
69
70
71 _MAX_TERM_LENGTH = 128
72
73
75 """interface to the xapian (http://xapian.org) indexer
76 """
77
78 QUERY_TYPE = xapian.Query
79 INDEX_DIRECTORY_NAME = "xapian"
80
def __init__(self, basedir, analyzer=None, create_allowed=True):
    """initialize or open a xapian database

    @raise ValueError: the given location exists, but the database type
        is incompatible (e.g. created by a different indexing engine)
    @raise OSError: the database failed to initialize

    @param basedir: the parent directory of the database
    @type basedir: str
    @param analyzer: bitwise combination of possible analyzer flags
        to be used as the default analyzer for this database. Leave it empty
        to use the system default analyzer (self.ANALYZER_DEFAULT).
        see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ...
    @type analyzer: int
    @param create_allowed: create the database, if necessary; default: True
    @type create_allowed: bool
    """
    # NOTE(review): the parent constructor is expected to provide
    # "self.location", which is used below - confirm against CommonIndexer
    super(XapianDatabase, self).__init__(basedir, analyzer=analyzer,
                                         create_allowed=create_allowed)
    self.reader = None
    self.writer = None
    if os.path.exists(self.location):
        # the location exists: open it read-only
        try:
            self.reader = xapian.Database(self.location)
        except xapian.DatabaseOpeningError as err_msg:
            raise ValueError("Indexer: failed to open xapian database " \
                    + "(%s) - maybe it is not a xapian database: %s" \
                    % (self.location, str(err_msg)))
    else:
        # the location does not exist: create a new database
        if not create_allowed:
            raise OSError("Indexer: skipping database creation")
        parent_path = os.path.dirname(self.location)
        try:
            if not os.path.isdir(parent_path):
                os.makedirs(parent_path)
        except (IOError, OSError) as err_msg:
            # os.makedirs reports failures as OSError
            raise OSError("Indexer: failed to create the parent " \
                    + "directory (%s) of the indexing database: %s" \
                    % (parent_path, str(err_msg)))
        try:
            self.writer = xapian.WritableDatabase(self.location,
                                                  xapian.DB_CREATE_OR_OPEN)
            # write the new (empty) database to disk immediately
            self.flush()
        except xapian.DatabaseOpeningError as err_msg:
            raise OSError("Indexer: failed to open or create a xapian " \
                    + "database (%s): %s" % (self.location, str(err_msg)))
132
136
def flush(self, optimize=False):
    """force to write the current changes to disk immediately

    @param optimize: ignored for xapian
    @type optimize: bool
    """
    # closing an open writer flushes its pending changes
    if self._writer_is_open():
        self._writer_close()
    # re-read the database, so the reader sees the current state
    self._index_refresh()
147
154
156 """generate a query based on an existing query object
157
158 basically this function should just create a copy of the original
159
160 @param query: the original query object
161 @type query: xapian.Query
162 @return: the resulting query object
163 @rtype: xapian.Query
164 """
165
166 return xapian.Query(query)
167
170 """generate a query for a plain term of a string query
171
172 basically this function parses the string and returns the resulting
173 query
174
175 @param text: the query string
176 @type text: str
177 @param require_all: boolean operator
178 (True -> AND (default) / False -> OR)
179 @type require_all: bool
180 @param analyzer: Define query options (partial matching, exact matching,
181 tokenizing, ...) as bitwise combinations of
182 CommonIndexer.ANALYZER_???.
183 This can override previously defined field analyzer settings.
184 If analyzer is None (default), then the configured analyzer for the
185 field is used.
186 @type analyzer: int
187 @return: resulting query object
188 @rtype: xapian.Query
189 """
190 qp = xapian.QueryParser()
191 qp.set_database(self.reader)
192 if require_all:
193 qp.set_default_op(xapian.Query.OP_AND)
194 else:
195 qp.set_default_op(xapian.Query.OP_OR)
196 if analyzer is None:
197 analyzer = self.analyzer
198 if analyzer & self.ANALYZER_PARTIAL > 0:
199 match_flags = xapian.QueryParser.FLAG_PARTIAL
200 return qp.parse_query(text, match_flags)
201 elif analyzer == self.ANALYZER_EXACT:
202
203 return xapian.Query(text)
204 else:
205
206 match_flags = 0
207 return qp.parse_query(text, match_flags)
208
210 """generate a field query
211
212 this functions creates a field->value query
213
214 @param field: the fieldname to be used
215 @type field: str
216 @param value: the wanted value of the field
217 @type value: str
218 @param analyzer: Define query options (partial matching, exact matching,
219 tokenizing, ...) as bitwise combinations of
220 CommonIndexer.ANALYZER_???.
221 This can override previously defined field analyzer settings.
222 If analyzer is None (default), then the configured analyzer for the
223 field is used.
224 @type analyzer: int
225 @return: the resulting query object
226 @rtype: xapian.Query
227 """
228 if analyzer is None:
229 analyzer = self.analyzer
230 if analyzer == self.ANALYZER_EXACT:
231
232 return xapian.Query("%s%s" % (field.upper(), value))
233
234 qp = xapian.QueryParser()
235 qp.set_database(self.reader)
236 if (analyzer & self.ANALYZER_PARTIAL > 0):
237
238 match_flags = xapian.QueryParser.FLAG_PARTIAL
239 return qp.parse_query(value, match_flags, field.upper())
240 else:
241
242 match_flags = 0
243 return qp.parse_query(value, match_flags, field.upper())
244
246 """generate a combined query
247
248 @param queries: list of the original queries
249 @type queries: list of xapian.Query
250 @param require_all: boolean operator
251 (True -> AND (default) / False -> OR)
252 @type require_all: bool
253 @return: the resulting combined query object
254 @rtype: xapian.Query
255 """
256 if require_all:
257 query_op = xapian.Query.OP_AND
258 else:
259 query_op = xapian.Query.OP_OR
260 return xapian.Query(query_op, queries)
261
263 """create an empty document to be filled and added to the index later
264
265 @return: the new document object
266 @rtype: xapian.Document
267 """
268 return xapian.Document()
269
271 """add a term to a document
272
273 @param document: the document to be changed
274 @type document: xapian.Document
275 @param term: a single term to be added
276 @type term: str
277 @param tokenize: should the term be tokenized automatically
278 @type tokenize: bool
279 """
280 if tokenize:
281 term_gen = xapian.TermGenerator()
282 term_gen.set_document(document)
283 term_gen.index_text(term)
284 else:
285 document.add_term(_truncate_term_length(term))
286
288 """add a field term to a document
289
290 @param document: the document to be changed
291 @type document: xapian.Document
292 @param field: name of the field
293 @type field: str
294 @param term: term to be associated to the field
295 @type term: str
296 @param tokenize: should the term be tokenized automatically
297 @type tokenize: bool
298 """
299 if tokenize:
300 term_gen = xapian.TermGenerator()
301 term_gen.set_document(document)
302 term_gen.index_text(term, 1, field.upper())
303 else:
304 document.add_term(_truncate_term_length("%s%s" % \
305 (field.upper(), term)))
306
308 """add a prepared document to the index database
309
310 @param document: the document to be added
311 @type document: xapian.Document
312 """
313
314 self._writer_open()
315 self.writer.add_document(document)
316
318 """begin a transaction
319
320 Xapian supports transactions to group multiple database modifications.
321 This avoids intermediate flushing and therefore increases performance.
322 """
323 self._writer_open()
324 self.writer.begin_transaction()
325
327 """cancel an ongoing transaction
328
329 no changes since the last execution of 'begin_transaction' are written
330 """
331 self.writer.cancel_transaction()
332 self._writer_close()
333
335 """submit the changes of an ongoing transaction
336
337 all changes since the last execution of 'begin_transaction' are written
338 """
339 self.writer.commit_transaction()
340 self._writer_close()
341
343 """return an object containing the results of a query
344
345 @param query: a pre-compiled xapian query
346 @type query: xapian.Query
347 @return: an object that allows access to the results
348 @rtype: XapianIndexer.CommonEnquire
349 """
350 enquire = xapian.Enquire(self.reader)
351 enquire.set_query(query)
352 return XapianEnquire(enquire)
353
355 """delete a specified document
356
357 @param docid: the document ID to be deleted
358 @type docid: int
359 """
360
361 self._writer_open()
362 try:
363 self.writer.delete_document(docid)
364 return True
365 except xapian.DocNotFoundError:
366 return False
367
def search(self, query, fieldnames):
    """return a list of the contents of specified fields for all matches of
    a query

    @param query: the query to be issued
    @type query: xapian.Query
    @param fieldnames: the name(s) of a field of the document content
    @type fieldnames: string | list of strings
    @return: a list of dicts containing the specified field(s)
    @rtype: list of dicts
    """
    try:
        string_types = basestring
    except NameError:
        # "basestring" does not exist in Python 3
        string_types = str
    if isinstance(fieldnames, string_types):
        fieldnames = [fieldnames]
    result = []
    try:
        self._walk_matches(query, _extract_fieldvalues, (result, fieldnames))
    except xapian.DatabaseModifiedError:
        # The index changed while the matches were read. Drop whatever was
        # collected so far (otherwise these items would be returned twice),
        # re-read the database and try again once.
        del result[:]
        self._index_refresh()
        self._walk_matches(query, _extract_fieldvalues, (result, fieldnames))
    return result
388
def _delete_stale_lock(self):
    """remove a 'flintlock' file that was not modified for over 15 minutes

    Nothing is done while the write access of this object is open.
    """
    if self._writer_is_open():
        return
    lockfile = os.path.join(self.location, 'flintlock')
    try:
        lock_age_minutes = (time.time() - os.path.getmtime(lockfile)) / 60
    except OSError:
        # there is no lock file (or it disappeared in the meantime)
        return
    if lock_age_minutes > 15:
        logging.warning("stale lock found in %s, removing.", self.location)
        os.remove(lockfile)
395
def _writer_open(self):
    """open write access for the indexing database and acquire an exclusive lock

    Nothing is done if the write access is already open.

    @raise ValueError: the database could not be opened
    """
    if self._writer_is_open():
        return
    self._delete_stale_lock()
    try:
        self.writer = xapian.WritableDatabase(self.location, xapian.DB_OPEN)
    except xapian.DatabaseOpeningError as err_msg:
        raise ValueError("Indexer: failed to open xapian database " \
                + "(%s) - maybe it is not a xapian database: %s" \
                % (self.location, str(err_msg)))
407
def _writer_close(self):
    """close indexing write access and remove database lock

    Pending changes are flushed before the writer object is dropped.
    Nothing is done if the write access is not open.
    """
    if self._writer_is_open():
        self.writer.flush()
        self.writer = None
413
def _writer_is_open(self):
    """check if the indexing write access is currently open

    @return: True if a "writer" attribute exists and is not None
    @rtype: bool
    """
    return getattr(self, "writer", None) is not None
417
def _index_refresh(self):
    """re-read the indexer database

    The reader is created if it does not exist yet; otherwise the existing
    reader is reopened.

    @raise ValueError: the database could not be opened
    """
    try:
        if self.reader is None:
            self.reader = xapian.Database(self.location)
        else:
            self.reader.reopen()
    except xapian.DatabaseOpeningError as err_msg:
        raise ValueError("Indexer: failed to open xapian database " \
                + "(%s) - maybe it is not a xapian database: %s" \
                % (self.location, str(err_msg)))
429
430
432 """interface to the xapian object for storing sets of matches
433 """
434
436 """return a specified number of qualified matches of a previous query
437
438 @param start: index of the first match to return (starting from zero)
439 @type start: int
440 @param number: the number of matching entries to return
441 @type number: int
442 @return: a set of matching entries and some statistics
443 @rtype: tuple of (returned number, available number, matches)
444 "matches" is a dictionary of::
445 ["rank", "percent", "document", "docid"]
446 """
447 matches = self.enquire.get_mset(start, number)
448 result = []
449 for match in matches:
450 elem = {}
451 elem["rank"] = match.rank
452 elem["docid"] = match.docid
453 elem["percent"] = match.percent
454 elem["document"] = match.document
455 result.append(elem)
456 return (matches.size(), matches.get_matches_estimated(), result)
457
458
460 """truncate the length of a term string length to the maximum allowed
461 for xapian terms
462
463 @param term: the value of the term, that should be truncated
464 @type term: str
465 @param taken: since a term consists of the name of the term and its
466 actual value, this additional parameter can be used to reduce the
467 maximum count of possible characters
468 @type taken: int
469 @return: the truncated string
470 @rtype: str
471 """
472 if len(term) > _MAX_TERM_LENGTH - taken:
473 return term[0:_MAX_TERM_LENGTH - taken - 1]
474 else:
475 return term
476
477
479 """add a dict of field values to a list
480
481 usually this function should be used together with '_walk_matches'
482 for traversing a list of matches
483 @param match: a single match object
484 @type match: xapian.MSet
485 @param result: the resulting dict will be added to this list
486 @type result: list of dict
487 @param fieldnames: the names of the fields to be added to the dict
488 @type fieldnames: list of str
489 """
490
491 item_fields = {}
492
493 for term in match["document"].termlist():
494 for fname in fieldnames:
495 if ((fname is None) and re.match("[^A-Z]", term.term)):
496 value = term.term
497 elif re.match("%s[^A-Z]" % str(fname).upper(), term.term):
498 value = term.term[len(fname):]
499 else:
500 continue
501
502 if fname in item_fields:
503 item_fields[fname].append(value)
504 else:
505 item_fields[fname] = [value]
506 result.append(item_fields)
507