Package translate :: Package search :: Package indexing :: Module PyLuceneIndexer
[hide private]
[frames] | [no frames]

Source Code for Module translate.search.indexing.PyLuceneIndexer

  1  # -*- coding: utf-8 -*- 
  2  # 
  3  # Copyright 2008 Zuza Software Foundation 
  4  # 
  5  # This file is part of translate. 
  6  # 
  7  # translate is free software; you can redistribute it and/or modify 
  8  # it under the terms of the GNU General Public License as published by 
  9  # the Free Software Foundation; either version 2 of the License, or 
 10  # (at your option) any later version. 
 11  # 
 12  # translate is distributed in the hope that it will be useful, 
 13  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 14  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 15  # GNU General Public License for more details. 
 16  # 
 17  # You should have received a copy of the GNU General Public License 
 18  # along with translate; if not, write to the Free Software 
 19  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 20  # 
 21   
 22   
 23  """ 
 24  interface for the PyLucene (v2.x) indexing engine 
 25   
 26  take a look at PyLuceneIndexer1.py for the PyLucene v1.x interface 
 27  """ 
 28   
 29  __revision__ = "$Id: PyLuceneIndexer.py 15717 2010-09-03 14:50:28Z alaaosh $" 
 30   
 31  import re 
 32  import os 
 33  import time 
 34  import logging 
 35   
 36  # try to import the PyLucene package (with the two possible names) 
 37  # remember the type of the detected package (compiled with jcc (>=v2.3) or 
 38  # with gcj (<=v2.2) 
 39  try: 
 40      import PyLucene 
 41      _COMPILER = 'gcj' 
 42  except ImportError: 
 43      # if this fails, then there is no pylucene installed 
 44      import lucene 
 45      PyLucene = lucene 
 46      PyLucene.initVM(PyLucene.CLASSPATH) 
 47      _COMPILER = 'jcc' 
 48   
 49  import CommonIndexer 
 50   
 51   
 52  UNNAMED_FIELD_NAME = "FieldWithoutAName" 
 53  MAX_FIELD_SIZE = 1048576 
 54   
 55   
def is_available():
    """check whether a usable PyLucene installation was detected

    Only the v2.x series of PyLucene is supported by this module.

    @return: True if PyLucene v2.x is installed
    @rtype: bool
    """
    return _get_pylucene_version() == 2
58 59
60 -class PyLuceneDatabase(CommonIndexer.CommonDatabase):
61 """manage and use a pylucene indexing database""" 62 63 QUERY_TYPE = PyLucene.Query 64 INDEX_DIRECTORY_NAME = "lucene" 65
66 - def __init__(self, basedir, analyzer=None, create_allowed=True):
67 """initialize or open an indexing database 68 69 Any derived class must override __init__. 70 71 @raise ValueError: the given location exists, but the database type 72 is incompatible (e.g. created by a different indexing engine) 73 @raise OSError: the database failed to initialize 74 75 @param basedir: the parent directory of the database 76 @type basedir: str 77 @param analyzer: bitwise combination of possible analyzer flags 78 to be used as the default analyzer for this database. Leave it empty 79 to use the system default analyzer (self.ANALYZER_DEFAULT). 80 see self.ANALYZER_TOKENIZE, self.ANALYZER_PARTIAL, ... 81 @type analyzer: int 82 @param create_allowed: create the database, if necessary; default: True 83 @type create_allowed: bool 84 """ 85 jvm = PyLucene.getVMEnv() 86 jvm.attachCurrentThread() 87 super(PyLuceneDatabase, self).__init__(basedir, analyzer=analyzer, 88 create_allowed=create_allowed) 89 self.pyl_analyzer = PyLucene.StandardAnalyzer() 90 self.writer = None 91 self.reader = None 92 self.index_version = None 93 try: 94 # try to open an existing database 95 tempreader = PyLucene.IndexReader.open(self.location) 96 tempreader.close() 97 except PyLucene.JavaError, err_msg: 98 # Write an error out, in case this is a real problem instead of an absence of an index 99 # TODO: turn the following two lines into debug output 100 #errorstr = str(e).strip() + "\n" + self.errorhandler.traceback_str() 101 #DEBUG_FOO("could not open index, so going to create: " + errorstr) 102 # Create the index, so we can open cached readers on it 103 if not create_allowed: 104 raise OSError("Indexer: skipping database creation") 105 try: 106 # create the parent directory if it does not exist 107 parent_path = os.path.dirname(self.location) 108 if not os.path.isdir(parent_path): 109 # recursively create all directories up to parent_path 110 os.makedirs(parent_path) 111 except IOError, err_msg: 112 raise OSError("Indexer: failed to create the parent " \ 113 + "directory 
(%s) of the indexing database: %s" \ 114 % (parent_path, err_msg)) 115 try: 116 tempwriter = PyLucene.IndexWriter(self.location, 117 self.pyl_analyzer, True) 118 tempwriter.close() 119 except PyLucene.JavaError, err_msg: 120 raise OSError("Indexer: failed to open or create a Lucene" \ 121 + " database (%s): %s" % (self.location, err_msg)) 122 # the indexer is initialized - now we prepare the searcher 123 # windows file locking seems inconsistent, so we try 10 times 124 numtries = 0 125 #self.dir_lock.acquire(blocking=True) 126 # read "self.reader", "self.indexVersion" and "self.searcher" 127 try: 128 while numtries < 10: 129 try: 130 self.reader = PyLucene.IndexReader.open(self.location) 131 self.indexVersion = self.reader.getCurrentVersion( 132 self.location) 133 self.searcher = PyLucene.IndexSearcher(self.reader) 134 break 135 except PyLucene.JavaError, e: 136 # store error message for possible later re-raise (below) 137 lock_error_msg = e 138 time.sleep(0.01) 139 numtries += 1 140 else: 141 # locking failed for 10 times 142 raise OSError("Indexer: failed to lock index database" \ 143 + " (%s)" % lock_error_msg) 144 finally: 145 pass 146 # self.dir_lock.release() 147 # initialize the searcher and the reader 148 self._index_refresh()
149
150 - def __del__(self):
151 """remove lock and close writer after loosing the last reference""" 152 jvm = PyLucene.getVMEnv() 153 jvm.attachCurrentThread() 154 self._writer_close() 155 if hasattr(self, "reader") and self.reader is not None: 156 self.reader.close() 157 self.reader = None 158 if hasattr(self, "searcher") and self.searcher is not None: 159 self.searcher.close() 160 self.searcher = None
161
162 - def flush(self, optimize=False):
163 """flush the content of the database - to force changes to be written 164 to disk 165 166 some databases also support index optimization 167 168 @param optimize: should the index be optimized if possible? 169 @type optimize: bool 170 """ 171 keep_open = self._writer_is_open() 172 self._writer_open() 173 try: 174 if optimize: 175 self.writer.optimize() 176 finally: 177 self.writer.flush() 178 if not keep_open: 179 self._writer_close()
180
181 - def make_query(self, *args, **kwargs):
182 jvm = PyLucene.getVMEnv() 183 jvm.attachCurrentThread() 184 return super(PyLuceneDatabase, self).make_query(*args, **kwargs)
185
    def _create_query_for_query(self, query):
        """generate a query based on an existing query object

        basically this function should just create a copy of the original

        @param query: the original query object
        @type query: PyLucene.Query
        @return: resulting query object
        @rtype: PyLucene.Query
        """
        # TODO: a deep copy or a clone would be safer
        # somehow not working (returns "null"): copy.deepcopy(query)
        # NOTE(review): the original object is returned as-is - callers must
        # not mutate it
        return query
199
200 - def _create_query_for_string(self, text, require_all=True, 201 analyzer=None):
202 """generate a query for a plain term of a string query 203 204 basically this function parses the string and returns the resulting 205 query 206 207 @param text: the query string 208 @type text: str 209 @param require_all: boolean operator 210 (True -> AND (default) / False -> OR) 211 @type require_all: bool 212 @param analyzer: the analyzer to be used 213 possible analyzers are: 214 - L{CommonDatabase.ANALYZER_TOKENIZE} 215 the field value is splitted to be matched word-wise 216 - L{CommonDatabase.ANALYZER_PARTIAL} 217 the field value must start with the query string 218 - L{CommonDatabase.ANALYZER_EXACT} 219 keep special characters and the like 220 @type analyzer: bool 221 @return: resulting query object 222 @rtype: PyLucene.Query 223 """ 224 if analyzer is None: 225 analyzer = self.analyzer 226 if analyzer == self.ANALYZER_EXACT: 227 analyzer_obj = PyLucene.KeywordAnalyzer() 228 else: 229 text = _escape_term_value(text) 230 analyzer_obj = PyLucene.StandardAnalyzer() 231 qp = PyLucene.QueryParser(UNNAMED_FIELD_NAME, analyzer_obj) 232 if (analyzer & self.ANALYZER_PARTIAL > 0): 233 # PyLucene uses explicit wildcards for partial matching 234 text += "*" 235 if require_all: 236 qp.setDefaultOperator(qp.Operator.AND) 237 else: 238 qp.setDefaultOperator(qp.Operator.OR) 239 return qp.parse(text)
240
241 - def _create_query_for_field(self, field, value, analyzer=None):
242 """generate a field query 243 244 this functions creates a field->value query 245 246 @param field: the fieldname to be used 247 @type field: str 248 @param value: the wanted value of the field 249 @type value: str 250 @param analyzer: the analyzer to be used 251 possible analyzers are: 252 - L{CommonDatabase.ANALYZER_TOKENIZE} 253 the field value is splitted to be matched word-wise 254 - L{CommonDatabase.ANALYZER_PARTIAL} 255 the field value must start with the query string 256 - L{CommonDatabase.ANALYZER_EXACT} 257 keep special characters and the like 258 @type analyzer: bool 259 @return: resulting query object 260 @rtype: PyLucene.Query 261 """ 262 if analyzer is None: 263 analyzer = self.analyzer 264 if analyzer == self.ANALYZER_EXACT: 265 analyzer_obj = PyLucene.KeywordAnalyzer() 266 else: 267 value = _escape_term_value(value) 268 analyzer_obj = PyLucene.StandardAnalyzer() 269 qp = PyLucene.QueryParser(field, analyzer_obj) 270 if (analyzer & self.ANALYZER_PARTIAL > 0): 271 # PyLucene uses explicit wildcards for partial matching 272 value += "*" 273 return qp.parse(value)
274
275 - def _create_query_combined(self, queries, require_all=True):
276 """generate a combined query 277 278 @param queries: list of the original queries 279 @type queries: list of PyLucene.Query 280 @param require_all: boolean operator 281 (True -> AND (default) / False -> OR) 282 @type require_all: bool 283 @return: the resulting combined query object 284 @rtype: PyLucene.Query 285 """ 286 combined_query = PyLucene.BooleanQuery() 287 for query in queries: 288 combined_query.add( 289 PyLucene.BooleanClause(query, _occur(require_all, False))) 290 return combined_query
291
    def _create_empty_document(self):
        """create an empty document to be filled and added to the index later

        @return: the new document object
        @rtype: PyLucene.Document
        """
        return PyLucene.Document()
299
300 - def _add_plain_term(self, document, term, tokenize=True):
301 """add a term to a document 302 303 @param document: the document to be changed 304 @type document: PyLucene.Document 305 @param term: a single term to be added 306 @type term: str 307 @param tokenize: should the term be tokenized automatically 308 @type tokenize: bool 309 """ 310 if tokenize: 311 token_flag = PyLucene.Field.Index.TOKENIZED 312 else: 313 token_flag = PyLucene.Field.Index.UN_TOKENIZED 314 document.add(PyLucene.Field(str(UNNAMED_FIELD_NAME), term, 315 PyLucene.Field.Store.YES, token_flag))
316
317 - def _add_field_term(self, document, field, term, tokenize=True):
318 """add a field term to a document 319 320 @param document: the document to be changed 321 @type document: PyLucene.Document 322 @param field: name of the field 323 @type field: str 324 @param term: term to be associated to the field 325 @type term: str 326 @param tokenize: should the term be tokenized automatically 327 @type tokenize: bool 328 """ 329 if tokenize: 330 token_flag = PyLucene.Field.Index.TOKENIZED 331 else: 332 token_flag = PyLucene.Field.Index.UN_TOKENIZED 333 document.add(PyLucene.Field(str(field), term, 334 PyLucene.Field.Store.YES, token_flag))
335
    def _add_document_to_index(self, document):
        """add a prepared document to the index database

        @param document: the document to be added
        @type document: PyLucene.Document
        """
        # opening the writer acquires the exclusive write lock (no-op if it
        # is already open)
        self._writer_open()
        self.writer.addDocument(document)
344
345 - def begin_transaction(self):
346 """PyLucene does not support transactions 347 348 Thus this function just opens the database for write access. 349 Call "cancel_transaction" or "commit_transaction" to close write 350 access in order to remove the exclusive lock from the database 351 directory. 352 """ 353 jvm = PyLucene.getVMEnv() 354 jvm.attachCurrentThread() 355 self._writer_open()
356
    def cancel_transaction(self):
        """PyLucene does not support transactions

        Thus this function just closes the database write access and removes
        the exclusive lock.

        See 'start_transaction' for details.
        """
        # discard buffered additions before releasing the lock
        if self._writer_is_open():
            self.writer.abort()
        self._writer_close()
368
    def commit_transaction(self):
        """PyLucene does not support transactions

        Thus this function just closes the database write access and removes
        the exclusive lock.

        See 'start_transaction' for details.
        """
        self._writer_close()
        # reopen reader/searcher so the committed changes become visible
        self._index_refresh()
379
380 - def get_query_result(self, query):
381 """return an object containing the results of a query 382 383 @param query: a pre-compiled query 384 @type query: a query object of the real implementation 385 @return: an object that allows access to the results 386 @rtype: subclass of CommonEnquire 387 """ 388 return PyLuceneHits(self.searcher.search(query))
389
    def delete_doc(self, ident):
        """delete the documents selected by "ident"

        @param ident: [list of] document IDs or queries (handled by the
                common implementation - see CommonDatabase.delete_doc)
        """
        # the base class performs the actual deletion
        super(PyLuceneDatabase, self).delete_doc(ident)
        # flush the deletions and reopen reader/searcher so they are visible
        self.reader.flush()
        self._index_refresh()
394
    def delete_document_by_id(self, docid):
        """delete a specified document

        @param docid: the document ID to be deleted
        @type docid: int
        """
        # deletion goes through the reader - release the writer's exclusive
        # lock first
        if self._writer_is_open():
            self._writer_close()
        try:
            self.reader.deleteDocument(docid)
        except PyLucene.JavaError:
            # the reader may be stale - refresh it and retry once
            self._index_refresh()
            self.reader.deleteDocument(docid)
408
409 - def search(self, query, fieldnames):
410 """return a list of the contents of specified fields for all matches of 411 a query 412 413 @param query: the query to be issued 414 @type query: a query object of the real implementation 415 @param fieldnames: the name(s) of a field of the document content 416 @type fieldnames: string | list of strings 417 @return: a list of dicts containing the specified field(s) 418 @rtype: list of dicts 419 """ 420 if isinstance(fieldnames, basestring): 421 fieldnames = [fieldnames] 422 hits = self.searcher.search(query) 423 if _COMPILER == 'jcc': 424 # add the ranking number and the retrieved document to the array 425 hits = [(hit, hits.doc(hit)) for hit in range(hits.length())] 426 result = [] 427 for hit, doc in hits: 428 fields = {} 429 for fieldname in fieldnames: 430 # take care for the special field "None" 431 if fieldname is None: 432 pyl_fieldname = UNNAMED_FIELD_NAME 433 else: 434 pyl_fieldname = fieldname 435 fields[fieldname] = doc.getValues(pyl_fieldname) 436 result.append(fields) 437 return result
438
439 - def _delete_stale_lock(self):
440 if self.reader.isLocked(self.location): 441 #HACKISH: there is a lock but Lucene api can't tell us how old it 442 # is, will have to check the filesystem 443 try: 444 # in try block just in case lock disappears on us while testing it 445 stat = os.stat(os.path.join(self.location, 'write.lock')) 446 age = (time.time() - stat.st_mtime) / 60 447 if age > 15: 448 logging.warning("stale lock found in %s, removing.", self.location) 449 self.reader.unlock(self.reader.directory()) 450 except: 451 pass
452
453 - def _writer_open(self):
454 """open write access for the indexing database and acquire an 455 exclusive lock 456 """ 457 if not self._writer_is_open(): 458 self._delete_stale_lock() 459 self.writer = PyLucene.IndexWriter(self.location, self.pyl_analyzer, 460 False) 461 # "setMaxFieldLength" is available since PyLucene v2 462 # we must stay compatible to v1 for the derived class 463 # (PyLuceneIndexer1) - thus we make this step optional 464 if hasattr(self.writer, "setMaxFieldLength"): 465 self.writer.setMaxFieldLength(MAX_FIELD_SIZE)
466 # do nothing, if it is already open 467
468 - def _writer_close(self):
469 """close indexing write access and remove the database lock""" 470 if self._writer_is_open(): 471 self.writer.close() 472 self.writer = None
473
474 - def _writer_is_open(self):
475 """check if the indexing write access is currently open""" 476 return hasattr(self, "writer") and not self.writer is None
477
    def _index_refresh(self):
        """re-read the indexer database"""
        try:
            if self.reader is None or self.searcher is None:
                # first use - open a fresh reader and searcher
                self.reader = PyLucene.IndexReader.open(self.location)
                self.searcher = PyLucene.IndexSearcher(self.reader)
            elif self.index_version != self.reader.getCurrentVersion( \
                    self.location):
                # the on-disk index changed - reopen reader and searcher
                # NOTE(review): the placement of the version bookkeeping
                # inside this branch follows the apparent original layout
                self.searcher.close()
                self.reader.close()
                self.reader = PyLucene.IndexReader.open(self.location)
                self.searcher = PyLucene.IndexSearcher(self.reader)
                self.index_version = self.reader.getCurrentVersion(self.location)
        except PyLucene.JavaError, e:
            # TODO: add some debugging output?
            #self.errorhandler.logerror("Error attempting to read index - try reindexing: "+str(e))
            pass
class PyLuceneHits(CommonIndexer.CommonEnquire):
    """an enquire object contains the information about the result of a request
    """

    def get_matches(self, start, number):
        """return a specified number of qualified matches of a previous query

        @param start: index of the first match to return (starting from zero)
        @type start: int
        @param number: the number of matching entries to return
        @type number: int
        @return: a set of matching entries and some statistics
        @rtype: tuple of (returned number, available number, matches)
                "matches" is a dictionary of::
                        ["rank", "percent", "document", "docid"]
        """
        available = self.enquire.length()
        # "stop" is the lowest index number to be omitted
        stop = min(start + number, available)
        if stop <= start:
            # the requested range is empty or beyond the available results
            return (0, available, [])
        matches = []
        for rank in range(start, stop):
            matches.append({
                "rank": rank,
                "docid": self.enquire.id(rank),
                "percent": self.enquire.score(rank),
                "document": self.enquire.doc(rank),
            })
        return (stop - start, available, matches)
530 531
def _occur(required, prohibited):
    """map a (required, prohibited) flag pair to a BooleanClause.Occur value

    @param required: the clause must match
    @type required: bool
    @param prohibited: the clause must not match
    @type prohibited: bool
    @return: the matching Occur constant, or None for the invalid
            combination of "required" and "prohibited" at the same time
    """
    mapping = {
        (True, False): PyLucene.BooleanClause.Occur.MUST,
        (False, False): PyLucene.BooleanClause.Occur.SHOULD,
        (False, True): PyLucene.BooleanClause.Occur.MUST_NOT,
    }
    # It is an error to specify a clause as both required
    # and prohibited -> falls through to the default None
    return mapping.get((required, prohibited))
543 544
def _get_pylucene_version():
    """get the installed pylucene version

    @return: 1 -> PyLucene v1.x / 2 -> PyLucene v2.x / 0 -> unknown
    @rtype: int
    """
    for major in (1, 2):
        if PyLucene.VERSION.startswith("%d." % major):
            return major
    # anything else (including future versions) is unsupported
    return 0
558 559
560 -def _escape_term_value(text):
561 return re.sub("\*", "", text)
562