Simple Document Search with Lucene
Apache Lucene speeds up document search by searching through a specially crafted index instead of opening the documents one by one at run time. Using Lucene is a two-step process: creating the index, and then searching the documents (via the index).
Part 1: Creating Index
You have to put all the text files in the directory specified by “DATA_DIR”. This program generates the index files and stores them inside the “INDEX_DIR” directory.
package com.crzyjcky.blog;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds a Lucene index from the regular files found directly inside a data
 * directory. Each file becomes one document with two fields: "contents"
 * (fed from the file's text) and "filename" (the file's name, stored as-is).
 */
public class Indexer {

    private static final String INDEX_DIR = "index";
    private static final String DATA_DIR = "data";

    /** Writer opened by {@link #index(String, String)}; null until then. */
    private IndexWriter indexWriter;

    /**
     * Indexes every regular file directly inside {@code dataDir} into a new
     * index located at {@code indexDir}. Call {@link #close()} afterwards to
     * release the index writer.
     *
     * @param indexDir directory in which the index files are written
     * @param dataDir  directory containing the files to index
     * @throws IOException if {@code dataDir} cannot be listed, a file cannot
     *                     be read, or the index cannot be written
     */
    public void index(String indexDir, String dataDir) throws IOException {
        // Honor the dataDir argument (not the DATA_DIR constant) and validate it
        // before touching the index: listFiles() returns null when the path is
        // missing or is not a directory.
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            throw new IOException("Cannot list data directory: " + dataDir);
        }

        Directory dir = FSDirectory.open(new File(indexDir));
        // Third argument is Lucene's "create" flag, passed as true here.
        indexWriter = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30), true,
                IndexWriter.MaxFieldLength.UNLIMITED);

        for (File file : files) {
            // Subdirectories and other non-regular entries cannot be opened
            // with FileReader, so skip them.
            if (!file.isFile()) {
                continue;
            }

            // NOTE(review): FileReader decodes with the platform default
            // charset; confirm that matches the encoding of the data files.
            FileReader reader = new FileReader(file);
            try {
                Document doc = new Document();
                doc.add(new Field("contents", reader));
                doc.add(new Field("filename", file.getName(), Field.Store.YES,
                        Field.Index.NOT_ANALYZED));
                indexWriter.addDocument(doc);
            } finally {
                // Make sure the file handle is released even when indexing
                // fails; closing an already-closed reader is harmless.
                reader.close();
            }

            System.out.println("doc added: " + file.getName());
        }
    }

    /**
     * Closes the index writer opened by {@link #index(String, String)}.
     * Does nothing if no writer has been opened.
     *
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException           if closing the writer fails
     */
    public void close() throws CorruptIndexException, IOException {
        if (indexWriter != null) {
            indexWriter.close();
        }
    }

    /**
     * Indexes the files in {@code DATA_DIR} into {@code INDEX_DIR}, printing
     * the start and end timestamps in milliseconds.
     *
     * @param args unused
     * @throws IOException if indexing fails
     */
    public static void main(String[] args) throws IOException {
        System.out.println("start: " + System.currentTimeMillis());

        Indexer indexer = new Indexer();
        try {
            indexer.index(INDEX_DIR, DATA_DIR);
        } finally {
            // Always release the writer, even when indexing fails midway.
            indexer.close();
        }

        System.out.println("end: " + System.currentTimeMillis());
    }
}
Part 2: Searching Documents
Note that Lucene searches through the index, not the actual documents.
package com.crzyjcky.blog;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Runs a query against the "contents" field of a Lucene index and prints the
 * best-scoring hits together with each hit's stored "filename" field.
 */
public class Searcher {

    private static final String INDEX_DIR = "index";

    /** Maximum number of hits requested from the searcher. */
    private static final int NUM_SEARCH = 10;

    /**
     * Searches the index at {@code indexDir} for {@code queryString} and
     * prints the total hit count followed by one line per returned hit
     * (document id, score and filename).
     *
     * @param indexDir    directory containing the index files
     * @param queryString query text, parsed with {@link QueryParser} against
     *                    the "contents" field
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException if {@code queryString} is not a valid query
     */
    public void search(String indexDir, String queryString) throws IOException, ParseException {
        // Honor the indexDir argument rather than the INDEX_DIR constant.
        Directory dir = FSDirectory.open(new File(indexDir));
        try {
            IndexSearcher indexSearcher = new IndexSearcher(dir);
            try {
                QueryParser queryParser = new QueryParser(Version.LUCENE_30, "contents",
                        new StandardAnalyzer(Version.LUCENE_30));
                Query query = queryParser.parse(queryString);

                TopDocs hits = indexSearcher.search(query, NUM_SEARCH);
                System.out.println("#hits: " + hits.totalHits);

                for (ScoreDoc scoreDoc : hits.scoreDocs) {
                    System.out.print("scoreDoc.doc: " + scoreDoc.doc + ", score: " + scoreDoc.score + " ");

                    // Load the stored fields of the hit to recover its filename.
                    Document doc = indexSearcher.doc(scoreDoc.doc);
                    System.out.println("\tfilename: " + doc.get("filename"));
                }
            } finally {
                // Release the searcher even when parsing or searching fails.
                indexSearcher.close();
            }
        } finally {
            dir.close();
        }
    }

    /**
     * Runs a sample query against the index in {@code INDEX_DIR}, printing
     * the start and end timestamps in milliseconds.
     *
     * @param args unused
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException if the sample query cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseException {
        System.out.println("start: " + System.currentTimeMillis());

        Searcher searcher = new Searcher();

        // put your query here
        String query = "Redistribution";
        searcher.search(INDEX_DIR, query);

        System.out.println("end: " + System.currentTimeMillis());
    }
}
Source code: blog-lucene