James Moger
2012-02-17 36ee2307ea083cfe8994b13a3023bc779b2b23ec
Implemented multi-repository Lucene search
4 files modified
122 ■■■■ changed files
src/com/gitblit/models/SearchResult.java 4 ●●● patch | view | raw | blame | history
src/com/gitblit/utils/LuceneUtils.java 87 ●●●● patch | view | raw | blame | history
tests/com/gitblit/tests/IssuesTest.java 4 ●●●● patch | view | raw | blame | history
tests/com/gitblit/tests/LuceneUtilsTest.java 27 ●●●●● patch | view | raw | blame | history
src/com/gitblit/models/SearchResult.java
@@ -25,6 +25,8 @@
    public String committer;
    public String summary;
    public String repository;
    public String id;
@@ -37,6 +39,6 @@
    @Override
    public String toString() {
        return type.name() + ": " + id;
        return  score + " : " + type.name() + " : " + repository + " : " + id;
    }
}
src/com/gitblit/utils/LuceneUtils.java
@@ -8,7 +8,7 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -26,6 +26,7 @@
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
@@ -90,13 +91,32 @@
    private static final String FIELD_LABEL = "label";
    private static final String FIELD_ATTACHMENT = "attachment";
    private static Set<String> excludes = new TreeSet<String>(Arrays.asList("7z", "arc", "arj",
            "bin", "bmp", "dll", "doc", "docx", "exe", "gif", "gz", "jar", "jpg", "lib", "lzh",
            "odg", "pdf", "ppt", "png", "so", "swf", "xcf", "xls", "xlsx", "zip"));
    private static Set<String> excludedExtensions = new TreeSet<String>(
            Arrays.asList("7z", "arc", "arj", "bin", "bmp", "dll", "doc",
                    "docx", "exe", "gif", "gz", "jar", "jpg", "lib", "lzh",
                    "odg", "pdf", "ppt", "png", "so", "swf", "xcf", "xls",
                    "xlsx", "zip"));
    private static Set<String> excludedBranches = new TreeSet<String>(
            Arrays.asList("/refs/heads/gb-issues"));
    private static final Map<File, IndexSearcher> SEARCHERS = new ConcurrentHashMap<File, IndexSearcher>();
    private static final Map<File, IndexWriter> WRITERS = new ConcurrentHashMap<File, IndexWriter>();
    /**
     * Returns the name of the repository.
     *
     * @param repository
     * @return the repository name
     */
    private static String getName(Repository repository) {
        if (repository.isBare()) {
            return repository.getDirectory().getName();
        } else {
            return repository.getDirectory().getParentFile().getName();
        }
    }
    /**
     * Deletes the Lucene index for the specified repository.
     * 
@@ -125,6 +145,7 @@
     */
    public static boolean index(Repository repository) {
        try {
            String repositoryName = getName(repository);
            Set<String> indexedCommits = new TreeSet<String>();
            IndexWriter writer = getIndexWriter(repository, true);
            // build a quick lookup of tags
@@ -139,6 +160,9 @@
            // walk through each branch
            List<RefModel> branches = JGitUtils.getLocalBranches(repository, true, -1);
            for (RefModel branch : branches) {
                if (excludedBranches.contains(branch.getName())) {
                    continue;
                }
                RevWalk revWalk = new RevWalk(repository);
                RevCommit rev = revWalk.parseCommit(branch.getObjectId());
@@ -154,6 +178,8 @@
                    Document doc = new Document();
                    doc.add(new Field(FIELD_OBJECT_TYPE, ObjectType.blob.name(), Store.YES,
                            Index.NOT_ANALYZED_NO_NORMS));
                    doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES,
                            Index.NOT_ANALYZED));
                    doc.add(new Field(FIELD_OBJECT_ID, treeWalk.getPathString(), Store.YES,
                            Index.NOT_ANALYZED));
                    doc.add(new Field(FIELD_DATE, revDate, Store.YES, Index.NO));
@@ -171,7 +197,7 @@
                        ext = name.substring(name.lastIndexOf('.') + 1);
                    }
                    if (StringUtils.isEmpty(ext) || !excludes.contains(ext)) {
                    if (StringUtils.isEmpty(ext) || !excludedExtensions.contains(ext)) {
                        // read the blob content
                        ObjectId entid = treeWalk.getObjectId(0);
                        FileMode entmode = treeWalk.getFileMode(0);
@@ -199,6 +225,8 @@
                String head = rev.getId().getName();
                if (indexedCommits.add(head)) {
                    Document doc = createDocument(rev, tags.get(head));
                    doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES,
                            Index.NOT_ANALYZED));
                    writer.addDocument(doc);
                }
@@ -208,6 +236,8 @@
                    String hash = rev.getId().getName();
                    if (indexedCommits.add(hash)) {
                        Document doc = createDocument(rev, tags.get(hash));
                        doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES,
                                Index.NOT_ANALYZED));
                        writer.addDocument(doc);
                    }
                }
@@ -221,6 +251,8 @@
                List<IssueModel> issues = IssueUtils.getIssues(repository, null);
                for (IssueModel issue : issues) {
                    Document doc = createDocument(issue);
                    doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES,
                            Index.NOT_ANALYZED));
                    writer.addDocument(doc);
                }
            }
@@ -344,6 +376,9 @@
     */
    private static boolean index(Repository repository, Document doc) {
        try {
            String repositoryName = getName(repository);
            doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES,
                    Index.NOT_ANALYZED));
            IndexWriter writer = getIndexWriter(repository, false);
            writer.addDocument(doc);
            resetIndexSearcher(repository);
@@ -363,6 +398,7 @@
        result.author = doc.get(FIELD_AUTHOR);
        result.committer = doc.get(FIELD_COMMITTER);
        result.type = ObjectType.fromName(doc.get(FIELD_OBJECT_TYPE));
        result.repository = doc.get(FIELD_REPOSITORY);
        result.id = doc.get(FIELD_OBJECT_ID);
        if (doc.get(FIELD_LABEL) != null) {
            result.labels = StringUtils.getStringsFromValue(doc.get(FIELD_LABEL));
@@ -437,17 +473,27 @@
    }
    /**
     * Search the repository for the given text or query
     * Searches the specified repositories for the given text or query
     * 
     * @param repository
     * @param text
     * @return a list of SearchResults
     *            if the text is null or empty, null is returned
     * @param maximumHits
     *            the maximum number of hits to collect
     * @param repositories
     *            a list of repositories to search. If no repositories are
     *            specified, null is returned.
     * @return a list of SearchResults ordered from the highest to the lowest score
     *
     */
    public static List<SearchResult> search(Repository repository, String text) {
    public static List<SearchResult> search(String text, int maximumHits,
            Repository... repositories) {
        if (StringUtils.isEmpty(text)) {
            return null;
        }
        Set<SearchResult> results = new HashSet<SearchResult>();
        if (repositories.length == 0) {
            return null;
        }
        Set<SearchResult> results = new LinkedHashSet<SearchResult>();
        StandardAnalyzer analyzer = new StandardAnalyzer(LUCENE_VERSION);
        try {
            // default search checks summary and content
@@ -461,10 +507,23 @@
            qp.setAllowLeadingWildcard(true);
            query.add(qp.parse(text), Occur.SHOULD);
            IndexSearcher searcher = getIndexSearcher(repository);
            IndexSearcher searcher;
            if (repositories.length == 1) {
                // single repository search
                searcher = getIndexSearcher(repositories[0]);
            } else {
                // multiple repository search
                List<IndexReader> readers = new ArrayList<IndexReader>();
                for (Repository repository : repositories) {
                    IndexSearcher repositoryIndex = getIndexSearcher(repository);
                    readers.add(repositoryIndex.getIndexReader());
                }
                IndexReader [] rdrs = readers.toArray(new IndexReader[readers.size()]);
                MultiReader reader = new MultiReader(rdrs);
                searcher = new IndexSearcher(reader);
            }
            Query rewrittenQuery = searcher.rewrite(query);
            TopScoreDocCollector collector = TopScoreDocCollector.create(200, true);
            TopScoreDocCollector collector = TopScoreDocCollector.create(maximumHits, true);
            searcher.search(rewrittenQuery, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;
            for (int i = 0; i < hits.length; i++) {
@@ -477,7 +536,7 @@
            e.printStackTrace();
        }
        return new ArrayList<SearchResult>(results);
    }
    }
    /**
     * Close all the index writers and searchers
tests/com/gitblit/tests/IssuesTest.java
@@ -146,7 +146,7 @@
        for (IssueModel issue : allIssues) {
            LuceneUtils.index(repository, issue, false);
        }
        List<SearchResult> hits = LuceneUtils.search(repository, "working");
        List<SearchResult> hits = LuceneUtils.search("working", 10, repository);
        assertTrue(hits.size() > 0);
        
        // reindex an issue
@@ -164,7 +164,7 @@
    @Test
    public void testLuceneQuery() throws Exception {
        Repository repository = GitBlitSuite.getIssuesTestRepository();
        List<SearchResult> hits = LuceneUtils.search(repository, "working");
        List<SearchResult> hits = LuceneUtils.search("working", 10, repository);
        LuceneUtils.close();
        repository.close();
        assertTrue(hits.size() > 0);
tests/com/gitblit/tests/LuceneUtilsTest.java
@@ -57,48 +57,48 @@
    public void testQuery() throws Exception {
        // 2 occurrences on the master branch
        Repository repository = GitBlitSuite.getHelloworldRepository();
        List<SearchResult> results = LuceneUtils.search(repository, "ada");
        List<SearchResult> results = LuceneUtils.search("ada", 10, repository);
        assertEquals(2, results.size());
        // author test
        results = LuceneUtils.search(repository, "author: tinogomes");
        results = LuceneUtils.search("author: tinogomes", 10, repository);
        assertEquals(2, results.size());
        repository.close();
        // blob test
        results = LuceneUtils.search(repository, "type: blob AND \"import std.stdio\"");
        results = LuceneUtils.search("type: blob AND \"import std.stdio\"", 10, repository);
        assertEquals(1, results.size());
        assertEquals("d.D", results.get(0).id);
        
        // 1 occurrence on the gh-pages branch
        repository = GitBlitSuite.getTheoreticalPhysicsRepository();
        results = LuceneUtils.search(repository, "\"add the .nojekyll file\"");
        results = LuceneUtils.search("\"add the .nojekyll file\"", 10, repository);
        assertEquals(1, results.size());
        assertEquals("Ondrej Certik", results.get(0).author);
        assertEquals("2648c0c98f2101180715b4d432fc58d0e21a51d7", results.get(0).id);
        
        // tag test
        results = LuceneUtils.search(repository, "\"qft split\"");
        results = LuceneUtils.search("\"qft split\"", 10, repository);
        assertEquals(1, results.size());
        assertEquals("Ondrej Certik", results.get(0).author);
        assertEquals("57c4f26f157ece24b02f4f10f5f68db1d2ce7ff5", results.get(0).id);
        assertEquals("[1st-edition]", results.get(0).labels.toString());
        results = LuceneUtils.search(repository, "type:blob AND \"src/intro.rst\"");
        results = LuceneUtils.search("type:blob AND \"src/intro.rst\"", 10, repository);
        assertEquals(4, results.size());
        
        // hash id tests
        results = LuceneUtils.search(repository, "id:57c4f26f157ece24b02f4f10f5f68db1d2ce7ff5");
        results = LuceneUtils.search("id:57c4f26f157ece24b02f4f10f5f68db1d2ce7ff5", 10, repository);
        assertEquals(1, results.size());
        results = LuceneUtils.search(repository, "id:57c4f26f157*");
        results = LuceneUtils.search("id:57c4f26f157*", 10, repository);
        assertEquals(1, results.size());
        repository.close();
        
        // annotated tag test
        repository = GitBlitSuite.getBluezGnomeRepository();
        results = LuceneUtils.search(repository, "\"release 1.8\"");
        results = LuceneUtils.search("\"release 1.8\"", 10, repository);
        assertEquals(1, results.size());
        assertEquals("[1.8]", results.get(0).labels.toString());
@@ -106,4 +106,13 @@
        
        LuceneUtils.close();
    }
    @Test
    public void testMultiSearch() throws Exception {
        List<SearchResult> results = LuceneUtils.search("test", 10,
                GitBlitSuite.getHelloworldRepository(),
                GitBlitSuite.getBluezGnomeRepository());
        LuceneUtils.close();
        assertEquals(10, results.size());
    }
}