Search: re-index note on every change

This commit is contained in:
Alexander Bezzubov 2015-12-11 19:11:21 +09:00
parent b2b93c436a
commit 09d44d2313
3 changed files with 112 additions and 31 deletions

View file

@ -484,6 +484,7 @@ public class Notebook {
/**
 * Closes the notebook repository and then the notebook index.
 *
 * <p>The index is closed in a {@code finally} block so that it is still
 * released when closing the repository throws.
 */
public void close() {
  try {
    this.notebookRepo.close();
  } finally {
    this.notebookIndex.close();
  }
}
}

View file

@ -6,6 +6,7 @@ import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
@ -19,6 +20,7 @@ import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
@ -42,14 +44,30 @@ import com.google.common.collect.Lists;
/**
* TODO(bzz): find a better name
* TODO(bzz): document thread-safety for writer
*/
public class SearchService {
private static final Logger LOG = LoggerFactory.getLogger(SearchService.class);
Directory ramDirectory;
static final String SEARCH_FIELD = "contents";
static final String ID_FIELD = "id";
Directory ramDirectory;
Analyzer analyzer;
IndexWriterConfig iwc;
IndexWriter writer;
/**
 * Creates the service with a new {@link RAMDirectory}, a
 * {@link StandardAnalyzer} and an {@link IndexWriter} over that directory.
 *
 * <p>If the writer cannot be opened the failure is only logged, and
 * {@code writer} is left {@code null}.
 */
public SearchService() {
  ramDirectory = new RAMDirectory();
  analyzer = new StandardAnalyzer();
  iwc = new IndexWriterConfig(analyzer);
  try {
    writer = new IndexWriter(ramDirectory, iwc);
  } catch (IOException e) {
    LOG.error("Failed to create new IndexWriter", e);
  }
}
/**
* Full-text search in all the notebooks
*
@ -58,7 +76,8 @@ public class SearchService {
*/
public List<Map<String, String>> search(String queryStr) {
if (null == ramDirectory) {
throw new IllegalStateException("Please call .index() first!");
throw new IllegalStateException(
"Something went wrong on instance creation time, index dir is null");
}
List<Map<String, String>> result = Collections.emptyList();
try (IndexReader indexReader = DirectoryReader.open(ramDirectory)) {
@ -127,24 +146,45 @@ public class SearchService {
return matchingParagraphs;
}
/**
* Indexes the full collection of notes: all of their paragraphs
*
* @param collection of Notes
*/
public void index(Collection<Note> collection) {
try {
Date start = new Date();
ramDirectory = new RAMDirectory();
Analyzer analyzer = new StandardAnalyzer();
IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
IndexWriter writer = new IndexWriter(ramDirectory, iwc);
long start = System.nanoTime();
try { //TODO(bzz): document thread-safety
indexDocs(writer, collection);
writer.close();
Date end = new Date();
LOG.info(end.getTime() - start.getTime() + " total milliseconds");
long end = System.nanoTime();
LOG.info("Indexing {} notebooks took {}ms",
collection.size(), TimeUnit.NANOSECONDS.toMillis(end - start));
} catch (Exception e) {
LOG.error("Failed to index all Notebooks", e);
}
}
/**
 * Re-indexes a single paragraph: replaces the document stored under the
 * paragraph's id with a freshly built one and commits the change.
 *
 * <p>Any exception raised while updating or committing is logged and not
 * propagated; the {@code throws} clause is kept for caller compatibility.
 *
 * @param noteId id of the note that owns the paragraph
 * @param noteName name of the note, stored with the document
 * @param p paragraph whose indexed document is replaced
 * @throws IOException declared for compatibility with existing callers
 */
public void updateDoc(String noteId, String noteName, Paragraph p) throws IOException {
  Document doc = newDocument(noteId, noteName, p);
  try {
    writer.updateDocument(new Term(ID_FIELD, formatId(noteId, p.getId())), doc);
    writer.commit();
  } catch (Exception e) {
    // Name the paragraph that failed; the old message was copied from index().
    LOG.error("Failed to update index for paragraph {} of note {}", p.getId(), noteId, e);
  }
}
/**
 * Frees the resources used by the Lucene index by closing the writer.
 *
 * <p>Does nothing when the writer was never created: the constructor only
 * logs a failure to open it and leaves the field {@code null}. An
 * {@link IOException} raised while closing is logged, not propagated.
 */
public void close() {
  if (writer == null) {
    return;
  }
  try {
    writer.close();
  } catch (IOException e) {
    LOG.error("Failed to .close() the notebook index", e);
  }
}
/**
* Indexes the given list of notebooks
*
@ -163,27 +203,45 @@ public class SearchService {
LOG.info("Skipping empty paragraph");
continue;
}
indexDoc(writer, note, doc);
indexDoc(writer, note.getId(), note.getName(), doc);
}
}
writer.commit();
}
/** Indexes a single paragraph = document */
void indexDoc(IndexWriter writer, Note note, Paragraph p) throws IOException {
/**
 * Adds one paragraph to the index as a single document.
 *
 * @param w writer the document is added to
 * @param noteId id of the note that owns the paragraph
 * @param noteName name of the note
 * @param p paragraph to index
 * @throws IOException if the writer fails to add the document
 */
void indexDoc(IndexWriter w, String noteId, String noteName, Paragraph p) throws IOException {
  w.addDocument(newDocument(noteId, noteName, p));
}
private Document newDocument(String noteId, String noteName, Paragraph p) {
Document doc = new Document();
// <note-id>/paragraph/<paragraph-id>
String id = String.format("%s/paragraph/%s", note.getId(), p.getId());
String id = formatId(noteId, p.getId());
Field pathField = new StringField(ID_FIELD, id, Field.Store.YES);
doc.add(pathField);
doc.add(new StringField("title", note.getName(), Field.Store.YES));
doc.add(new StringField("title", noteName, Field.Store.YES));
Date date = p.getDateStarted() != null ? p.getDateStarted() : p.getDateCreated();
doc.add(new LongField("modified", date.getTime(), Field.Store.NO));
doc.add(new TextField(SEARCH_FIELD, p.getText(), Field.Store.YES));
return doc;
}
writer.addDocument(doc);
/**
 * Builds a document id of the form {@code <note-id>/paragraph/<paragraph-id>}.
 *
 * @param noteId Id of the Note
 * @param paragraphId Id of the paragraph
 * @return the combined id string
 */
private String formatId(String noteId, String paragraphId) {
  return noteId + "/paragraph/" + paragraphId;
}
}

View file

@ -2,6 +2,7 @@ package org.apache.zeppelin.search;
import static com.google.common.truth.Truth.assertThat;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
@ -23,9 +24,9 @@ public class SearchServiceTest {
@Test public void canIndexNotebook() {
//given
Note note1 = newNoteWithParapgraph("Notebook1", "test");
Note note2 = newNoteWithParapgraph("Notebook2", "not test");
Note note2 = newNoteWithParapgraph("Notebook2", "not test");
List<Note> notebook = Arrays.asList(note1, note2);
//when
notebookIndex.index(notebook);
}
@ -35,28 +36,49 @@ public class SearchServiceTest {
Note note1 = newNoteWithParapgraph("Notebook1", "test");
Note note2 = newNoteWithParapgraphs("Notebook2", "not test", "not test at all");
notebookIndex.index(Arrays.asList(note1, note2));
//when
List<Map<String, String>> results = notebookIndex.search("all");
//then
assertThat(results).isNotEmpty();
assertThat(results).isNotEmpty();
assertThat(results.size()).isEqualTo(1);
assertThat(results.get(0)).containsEntry("id",
String.format("%s/paragraph/%s", note2.getId(), note2.getLastParagraph().getId()));
}
@Test(expected=IllegalStateException.class)
@Test //(expected=IllegalStateException.class)
public void canNotSearchBeforeIndexing() {
//given no notebookIndex.index() was made
//when
notebookIndex.search("anything");
List<Map<String, String>> result = notebookIndex.search("anything");
//then
assertThat(result).isEmpty();
}
/**
 * After a paragraph's text is changed and re-indexed via updateDoc, a search
 * for its old text finds nothing and a search for its new text finds it.
 */
@Test public void canIndexAndReIndex() throws IOException {
  //given
  Note first = newNoteWithParapgraph("Notebook1", "test");
  Note second = newNoteWithParapgraphs("Notebook2", "not test", "not test at all");
  notebookIndex.index(Arrays.asList(first, second));

  //when
  Paragraph edited = second.getLastParagraph();
  edited.setText("test indeed");
  notebookIndex.updateDoc(second.getId(), second.getName(), edited);

  //then
  assertThat(notebookIndex.search("all")).isEmpty();
  assertThat(notebookIndex.search("indeed")).isNotEmpty();
}
/**
* Creates a new Note with the given name,
* adds a new paragraph with the given text
*
*
* @param noteName name of the note
* @param parText text of the paragraph
* @return Note
@ -66,14 +88,14 @@ public class SearchServiceTest {
addParagraphWithText(note1, parText);
return note1;
}
/**
* Creates a new Note with the given name,
* adds N paragraphs with the given texts
*/
private Note newNoteWithParapgraphs(String noteName, String... parTexts) {
Note note1 = newNote(noteName);
for (String parText : parTexts) {
for (String parText : parTexts) {
addParagraphWithText(note1, parText);
}
return note1;