/** * Distribution License: * JSword is free software; you can redistribute it and/or modify it under * the terms of the GNU Lesser General Public License, version 2.1 as published by * the Free Software Foundation. This program is distributed in the hope * that it will be useful, but WITHOUT ANY WARRANTY; without even the * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * See the GNU Lesser General Public License for more details. * * The License is available on the internet at: * http://www.gnu.org/copyleft/lgpl.html * or by writing to: * Free Software Foundation, Inc. * 59 Temple Place - Suite 330 * Boston, MA 02111-1307, USA * * Copyright: 2005 * The copyright to this program is held by it's authors. * * ID: $Id:PdaLuceneIndexCreator.java 984 2006-01-23 14:18:33 -0500 (Mon, 23 Jan 2006) dmsmith $ */ package org.crosswire.jsword.index.lucene; import java.io.File; import java.io.IOException; import java.net.URI; import java.util.ArrayList; import java.util.List; import net.bible.service.common.CommonUtils; import net.bible.service.common.Logger; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.search.Searcher; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.crosswire.common.progress.JobManager; import org.crosswire.common.progress.Progress; import org.crosswire.common.util.NetUtil; import org.crosswire.common.util.Reporter; import org.crosswire.jsword.JSMsg; import org.crosswire.jsword.book.Book; import org.crosswire.jsword.book.BookCategory; import org.crosswire.jsword.book.BookData; import org.crosswire.jsword.book.BookException; import org.crosswire.jsword.book.DataPolice; import org.crosswire.jsword.book.FeatureType; import org.crosswire.jsword.book.OSISUtil; import org.crosswire.jsword.index.IndexStatus; import 
org.crosswire.jsword.index.lucene.analysis.LuceneAnalyzer; import org.crosswire.jsword.passage.Key; import org.crosswire.jsword.passage.PassageKeyFactory; import org.jdom.Element; /** * Implement the SearchEngine using Lucene as the search engine. * * @see gnu.lgpl.License for license details.
* The copyright to this program is held by its authors.
 * @author Joe Walker [joe at eireneh dot com]
 * @author Martin Denham [mjdenham at gmail dot com]
 */
public class PdaLuceneIndexCreator {
    /*
     * The following fields are named the same as Sword in the hopes of sharing
     * indexes.
     */

    /**
     * The Lucene field for the osisID
     */
    public static final String FIELD_KEY = "key";

    /**
     * The Lucene field for the text contents
     */
    public static final String FIELD_BODY = "content";

    /**
     * The Lucene field for the strong numbers
     */
    public static final String FIELD_STRONG = "strong";

    /** We are on a device with limited RAM so don't use too much. */
    private static final int MAX_RAM_BUFFER_SIZE_MB = 1;

    private static final String TAG = "PdaLuceneIndexCreator";
    private static final Logger logger = new Logger(TAG);

    /**
     * Generate an index for the given book, telling the job about progress as
     * we go. All of the work happens in this constructor: the index is built
     * in a temporary directory next to the target and renamed into place once
     * it is complete. The temporary directory is always deleted before
     * returning, and the book's index status is always updated (DONE only if
     * the final path exists, otherwise UNDONE).
     *
     * @param book
     *            the book to index
     * @param storage
     *            the location in which the finished index should be installed
     * @param create
     *            must be true; only checked via an assertion
     * @throws BookException
     *             if the storage location cannot be resolved or anything goes
     *             wrong while building or installing the index
     */
    public PdaLuceneIndexCreator(Book book, URI storage, boolean create) throws BookException {
        assert create;

        logger.info("Index target dir:"+storage.getPath());

        this.book = book;
        File finalPath = null;
        try {
            finalPath = NetUtil.getAsFile(storage);
            this.path = finalPath.getCanonicalPath();
        } catch (IOException ex) {
            // TRANSLATOR: Error condition: Could not initialize a search index. Lucene is the name of the search technology being used.
            throw new BookException(JSMsg.gettext("Failed to initialize Lucene search engine."), ex);
        }

        // Indexing the book is a good way to police data errors.
        DataPolice.setBook(book.getBookMetaData());

        // TRANSLATOR: Progress label indicating the start of indexing. {0} is a placeholder for the book's short name.
        String jobName = JSMsg.gettext("Creating index. Processing {0}", book.getInitials());
        Progress job = JobManager.createJob(jobName, Thread.currentThread());
        job.beginJob(jobName);

        IndexStatus finalStatus = IndexStatus.UNDONE;

        // Keys whose content could not be read; reported to the user at the end.
        List<Key> errors = new ArrayList<Key>();
        File tempPath = new File(path + '.' + IndexStatus.CREATING.toString());

        try {
            // this can throw an error if indexing is misconfigured so needs to be in the try/catch block
            Analyzer analyzer = new LuceneAnalyzer(book);

            synchronized (CREATING) {
                book.setIndexStatus(IndexStatus.CREATING);

                // Build the index in the temporary directory.
                IndexWriter writer = null;
                try {
                    Directory destination = FSDirectory.open(tempPath.getCanonicalFile());
                    writer = new IndexWriter(destination, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
                    writer.setRAMBufferSizeMB(MAX_RAM_BUFFER_SIZE_MB);

                    logger.debug("Beginning indexing "+book.getName());
                    try {
                        generateSearchIndexImpl(job, errors, writer, keysToIndex(), 0);
                    } catch (Exception e) {
                        logger.info("Indexing failed for "+book.getName()+": "+e);
                        // TRANSLATOR: Error condition: the search index could not be created.
                        // Keep the cause so the real failure is not lost.
                        throw new BookException(JSMsg.gettext("Installation failed."), e);
                    }

                    logger.info("Finished indexing "+book.getName()+" starting optimisation");

                    // TRANSLATOR: Progress label for optimizing a search index. This may take a bit of time, so we have a label for it.
                    job.setSectionName(JSMsg.gettext("Optimizing"));
                    // must be 1 more than 95 for the notification to be sent through to the listener
                    job.setWork(96);

                    // Consolidate the index into the minimum number of files.
                    writer.optimize();
                } finally {
                    // writer must be closed even on error to release the Lucene Lock
                    if (writer != null) {
                        writer.close();
                    }
                }

                // From here on we are installing the index, so the job may no longer be cancelled.
                job.setCancelable(false);
                if (!job.isFinished()) {
                    logger.debug("Renaming "+tempPath+" to "+finalPath);
                    if (!tempPath.renameTo(finalPath)) {
                        // TRANSLATOR: The search index could not be moved to its final location.
                        throw new BookException(JSMsg.gettext("Installation failed."));
                    }
                }

                if (finalPath.exists()) {
                    finalStatus = IndexStatus.DONE;
                }

                reportUnindexedKeys(errors);
            }
        } catch (Exception ex) {
            job.cancel();
            // TRANSLATOR: Common error condition: Some error happened while creating a search index.
            throw new BookException(JSMsg.gettext("Failed to initialize Lucene search engine."), ex);
        } finally {
            book.setIndexStatus(finalStatus);
            job.done();
            // ensure the temp path is gone - errors can leave it there and cause further problems
            CommonUtils.deleteDirectory(tempPath);
        }
    }

    /**
     * Choose the set of keys to index for the current book.
     *
     * @return the key list of all passages for Bibles, otherwise the book's
     *         own global key list
     */
    private Key keysToIndex() {
        if (book.getBookCategory().equals(BookCategory.BIBLE)) {
            // this method is so much faster than getGlobalKeyList but not accurate e.g. some bibles are only NT
            return PassageKeyFactory.instance().getGlobalKeyList();
        }
        return book.getGlobalKeyList();
    }

    /**
     * Tell the user which keys could not be indexed; does nothing if there
     * were no errors.
     *
     * @param errors
     *            the keys whose content could not be read during indexing
     */
    private void reportUnindexedKeys(List<Key> errors) {
        if (errors.isEmpty()) {
            return;
        }

        StringBuilder buf = new StringBuilder();
        for (Key error : errors) {
            buf.append(error);
            buf.append('\n');
        }
        // TRANSLATOR: It is likely that one or more verses could not be indexed due to errors in those verses.
        // This message gives a listing of them to the user.
        Reporter.informUser(this, JSMsg.gettext("The following verses have errors and could not be indexed\n{0}", buf));
    }

    /**
     * Dig down into a Key indexing as we go. Sub-keys that can have children
     * are handled recursively; every other sub-key becomes one Lucene document
     * provided it yields body text or Strong's numbers in addition to its key.
     * Sub-keys whose OSIS fragment cannot be read are added to
     * <code>errors</code> and skipped. Indexing at this level stops early if
     * the current thread has been interrupted.
     *
     * @param job
     *            the progress job to update (every 50 keys)
     * @param errors
     *            collects the keys that could not be read
     * @param writer
     *            the index writer that receives the documents
     * @param key
     *            the key whose sub-keys are to be indexed
     * @param count
     *            the number of keys already processed, used as the starting
     *            point for progress reporting
     * @throws BookException
     *             declared for callers; read failures of individual keys are
     *             recorded in <code>errors</code> rather than thrown
     * @throws IOException
     *             if the document cannot be written to the index
     */
    private void generateSearchIndexImpl(Progress job, List<Key> errors, IndexWriter writer, Key key, int count) throws BookException, IOException {
        logger.debug("Generating search Index");
        boolean hasStrongs = book.getBookMetaData().hasFeature(FeatureType.STRONGS_NUMBERS);

        String oldRootName = "";
        int percent = 0;
        String rootName = "";
        BookData data = null;
        Element osis = null;

        // Set up for reuse: one document and one field of each kind serve every key.
        Document doc = new Document();
        Field keyField = new Field(FIELD_KEY, "", Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO);
        Field bodyField = new Field(FIELD_BODY, "", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.NO);
        Field strongField = new Field(FIELD_STRONG, "", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.NO);

        int size = key.getCardinality();
        logger.debug("Number of keys:"+size);
        int subCount = count;
        for (Key subkey : key) {
            if (subkey.canHaveChildren()) {
                // NOTE(review): the nested call's progress count is not fed back into
                // subCount, so the percentage may be off for nested keys - confirm intent.
                generateSearchIndexImpl(job, errors, writer, subkey, subCount);
            } else {
                // Set up DataPolice for this key.
                DataPolice.setKey(subkey);

                data = new BookData(book, subkey);
                osis = null;

                try {
                    osis = data.getOsisFragment();
                } catch (BookException e) {
                    // Remember the bad key and carry on with the rest.
                    errors.add(subkey);
                    continue;
                }

                // Remove all fields from the document
                doc.getFields().clear();

                // Do the actual indexing
                // Always add the key
                keyField.setValue(subkey.getOsisRef());
                doc.add(keyField);

                addField(doc, bodyField, OSISUtil.getCanonicalText(osis));

                if (hasStrongs) {
                    addField(doc, strongField, OSISUtil.getStrongsNumbers(osis));
                }

                // Add the document if we added more than just the key.
                if (doc.getFields().size() > 1) {
                    writer.addDocument(doc);
                }

                subCount++;

                // report progress but not all the time for efficiency
                if (subCount % 50 == 0) {
                    rootName = subkey.getRootName();
                    if (!rootName.equals(oldRootName)) {
                        oldRootName = rootName;
                        job.setSectionName(rootName);
                    }

                    // Indexing accounts for the first 95%; the rest is optimisation.
                    percent = 95 * subCount / size;

                    job.setWork(percent);

                    // and force a garbage collect every so often
                    System.gc();
                }

                // This could take a long time ...
                Thread.yield();
                if (Thread.currentThread().isInterrupted()) {
                    break;
                }
            }
        }
    }

    /**
     * Set the field's value and add it to the document, but only if there is
     * some text to index.
     *
     * @param doc
     *            the document to add the field to
     * @param field
     *            the reusable field to fill in
     * @param text
     *            the value for the field; null or empty means the field is
     *            not added
     */
    private void addField(Document doc, Field field, String text) {
        if (text != null && text.length() > 0) {
            field.setValue(text);
            doc.add(field);
        }
    }

    /**
     * A synchronization lock point to prevent us from doing 2 index runs at a
     * time.
     */
    private static final Object CREATING = new Object();

    /**
     * Are we active (not assigned or read anywhere in this class)
     */
    @SuppressWarnings("unused")
    private boolean active;

    /**
     * The Book that we are indexing
     */
    protected Book book;

    /**
     * The location of this index
     */
    private String path;

    /**
     * The Lucene directory for the path (not assigned anywhere in this class).
     */
    protected Directory directory;

    /**
     * The Lucene search engine (not assigned anywhere in this class).
     */
    protected Searcher searcher;
}