Below is sample Lucene code that indexes the files inside a folder. The code indexes (that is, creates fields for) the file path, file title, modified date, and contents of each file.
This Java code expects the index path (where the index files will be created) and the path of the folder to index as program arguments, for example: "java IndexFiles [-index INDEX_PATH] [-docs DOCS_PATH]".
The logic of the code is to iterate through each file in the folder and call the method indexDoc(), where the above said fields are created and added to a Document object. This means that for each file there will be a document object and these document objects will be added to IndexWriter.
Below is a screenshot of the indexed file folder:
import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.Date; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.NumericField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; public class IndexFiles { public static void main(String[] args) { String usage = "java IndexFiles [-index INDEX_PATH] [-docs DOCS_PATH] \n\n" + "This indexes the documents in DOCS_PATH, creating a Lucene index in" + "INDEX_PATH that can be searched with SearchFiles"; String indexPath = "index"; String docsPath = null; for (int i = 0; i < args.length; i++) { if ("-index".equals(args[i])) { indexPath = args[i + 1]; i++; } else if ("-docs".equals(args[i])) { docsPath = args[i + 1]; i++; } } if (docsPath == null) { System.err.println("Usage: " + usage); System.exit(1); } final File docDir = new File(docsPath); if (!docDir.exists() || !docDir.canRead()) { System.out.println("Document directory " + docDir.getAbsolutePath() + "does not exist or is not readable, please check the path"); System.exit(1); } Date start = new Date(); try { System.out.println("Indexing to directory '" + indexPath + "'..."); Directory dir = FSDirectory.open(new File(indexPath)); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31,analyzer); iwc.setOpenMode(OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, iwc); findFilesAndIndex(writer, docDir); writer.close(); Date end = new Date(); System.out.println(end.getTime() - start.getTime()+ " total 
milliseconds"); } catch (IOException e) { System.out.println(" caught a " + e.getClass()+ "\n with message: " + e.getMessage()); } } static void findFilesAndIndex(IndexWriter writer, File file) throws IOException { FileInputStream fis = null; try{ if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); if (files != null) { for (int i = 0; i < files.length; i++) { findFilesAndIndex(writer, new File(file, files[i])); } } } else { fis = new FileInputStream(file); indexDoc(writer, file,fis); } } }catch (IOException e) { System.out.println(" caught a " + e.getClass()+ "\n with message: " + e.getMessage()); }finally { if(fis != null){ fis.close(); } } } static void indexDoc(IndexWriter writer, File file,FileInputStream fis) throws IOException { Document doc = new Document(); Field pathField = new Field("path", file.getPath(),Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); pathField.setOmitTermFreqAndPositions(true); doc.add(pathField); Field titleField = new Field("title", file.getName(),Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); pathField.setOmitTermFreqAndPositions(true); doc.add(titleField); NumericField modifiedField = new NumericField("modified"); modifiedField.setLongValue(file.lastModified()); doc.add(modifiedField); doc.add(new Field("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8")))); System.out.println("adding " + file); writer.addDocument(doc); } }
Exact code I was looking for, awesome logic, thanks for the share. Sample Documents
ReplyDelete