From d896e62505a429ee27237b3302d7c04e7ff8e6df Mon Sep 17 00:00:00 2001 From: James Moger <james.moger@gitblit.com> Date: Thu, 15 Mar 2012 18:02:14 -0400 Subject: [PATCH] Refactored Lucene integration and fixed two index deleteDocument bugs --- src/com/gitblit/LuceneExecutor.java | 1182 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 files changed, 1,090 insertions(+), 92 deletions(-) diff --git a/src/com/gitblit/LuceneExecutor.java b/src/com/gitblit/LuceneExecutor.java index c9e4c73..527609e 100644 --- a/src/com/gitblit/LuceneExecutor.java +++ b/src/com/gitblit/LuceneExecutor.java @@ -15,47 +15,138 @@ */ package com.gitblit; -import java.text.MessageFormat; -import java.util.HashSet; -import java.util.Queue; -import java.util.Set; -import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.concurrent.atomic.AtomicBoolean; +import static org.eclipse.jgit.treewalk.filter.TreeFilter.ANY_DIFF; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.text.MessageFormat; +import java.text.ParseException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.concurrent.ConcurrentHashMap; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.DateTools; +import org.apache.lucene.document.DateTools.Resolution; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.index.MultiReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopScoreDocCollector; +import org.apache.lucene.search.highlight.Fragmenter; +import org.apache.lucene.search.highlight.Highlighter; +import org.apache.lucene.search.highlight.InvalidTokenOffsetsException; +import org.apache.lucene.search.highlight.QueryScorer; +import org.apache.lucene.search.highlight.SimpleHTMLFormatter; +import org.apache.lucene.search.highlight.SimpleSpanFragmenter; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.Version; +import org.eclipse.jgit.diff.DiffEntry.ChangeType; +import org.eclipse.jgit.lib.Constants; +import org.eclipse.jgit.lib.ObjectId; +import org.eclipse.jgit.lib.ObjectLoader; +import org.eclipse.jgit.lib.ObjectReader; import org.eclipse.jgit.lib.Repository; +import org.eclipse.jgit.revwalk.RevCommit; +import org.eclipse.jgit.revwalk.RevTree; +import org.eclipse.jgit.revwalk.RevWalk; +import org.eclipse.jgit.storage.file.FileBasedConfig; +import org.eclipse.jgit.treewalk.EmptyTreeIterator; +import org.eclipse.jgit.treewalk.TreeWalk; +import org.eclipse.jgit.util.FS; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.gitblit.models.RepositoryModel; +import com.gitblit.Constants.SearchObjectType; +import com.gitblit.models.IssueModel; +import com.gitblit.models.IssueModel.Attachment; +import com.gitblit.models.PathModel.PathChangeModel; +import com.gitblit.models.RefModel; +import com.gitblit.models.SearchResult; +import com.gitblit.utils.ArrayUtils; +import com.gitblit.utils.IssueUtils; import com.gitblit.utils.JGitUtils; -import com.gitblit.utils.LuceneUtils; -import com.gitblit.utils.LuceneUtils.IndexResult; +import com.gitblit.utils.StringUtils; /** - * The Lucene executor handles indexing repositories synchronously and - * asynchronously from a queue. + * The Lucene executor handles indexing and searching repositories. * * @author James Moger * */ public class LuceneExecutor implements Runnable { + + + private static final int INDEX_VERSION = 1; + private static final String FIELD_OBJECT_TYPE = "type"; + private static final String FIELD_ISSUE = "issue"; + private static final String FIELD_PATH = "path"; + private static final String FIELD_COMMIT = "commit"; + private static final String FIELD_BRANCH = "branch"; + private static final String FIELD_REPOSITORY = "repository"; + private static final String FIELD_SUMMARY = "summary"; + private static final String FIELD_CONTENT = "content"; + private static final String FIELD_AUTHOR = "author"; + private static final String FIELD_COMMITTER = "committer"; + private static final String FIELD_DATE = "date"; + private static final String FIELD_TAG = "tag"; + private static final String FIELD_LABEL = "label"; + private static final String FIELD_ATTACHMENT = "attachment"; + + private static final String CONF_FILE = "lucene.conf"; + private static final String LUCENE_DIR = "lucene"; + private static final String CONF_INDEX = "index"; + private static final String CONF_VERSION = "version"; + private static final String CONF_ALIAS = "aliases"; + private static final String CONF_BRANCH = "branches"; + + private static final Version LUCENE_VERSION = Version.LUCENE_35; + private final Logger logger = LoggerFactory.getLogger(LuceneExecutor.class); + + private final IStoredSettings storedSettings; + private final File repositoriesFolder; + + private final Map<String, IndexSearcher> searchers = new ConcurrentHashMap<String, IndexSearcher>(); + private final Map<String, IndexWriter> writers = new ConcurrentHashMap<String, IndexWriter>(); + + private final Set<String> excludedExtensions = new TreeSet<String>(Arrays.asList("7z", "arc", + "arj", "bin", "bmp", "dll", "doc", "docx", "exe", "gif", "gz", "jar", "jpg", "lib", + "lzh", "odg", "pdf", "ppt", "png", "so", "swf", "xcf", "xls", "xlsx", "zip")); - private final Queue<String> queue = new ConcurrentLinkedQueue<String>(); - - private final IStoredSettings settings; - - private final boolean isLuceneEnabled; - - private final boolean isPollingMode; - - private final AtomicBoolean firstRun = new AtomicBoolean(true); - - public LuceneExecutor(IStoredSettings settings) { - this.settings = settings; - this.isLuceneEnabled = settings.getBoolean(Keys.lucene.enable, false); - this.isPollingMode = settings.getBoolean(Keys.lucene.pollingMode, false); + private final Set<String> excludedBranches = new TreeSet<String>( + Arrays.asList("/refs/heads/gb-issues")); + + public LuceneExecutor(IStoredSettings settings, File repositoriesFolder) { + this.storedSettings = settings; + this.repositoriesFolder = repositoriesFolder; } /** @@ -64,70 +155,33 @@ * @return true if the Lucene executor is ready to index repositories */ public boolean isReady() { - return isLuceneEnabled; + return storedSettings.getBoolean(Keys.lucene.enable, false); } /** - * Returns the status of the Lucene queue. - * - * @return true, if the queue is empty + * Run is executed by the gitblit executor service at whatever frequency + * is specified in the settings. Because this is called by an executor + * service, calls will queue - i.e. there can never be concurrent execution + * of repository index updates. */ - public boolean hasEmptyQueue() { - return queue.isEmpty(); - } - - /** - * Queues a repository to be asynchronously indexed. - * - * @param repository - * @return true if the repository was queued - */ - public boolean queue(RepositoryModel repository) { - if (!isReady()) { - return false; - } - queue.add(repository.name); - return true; - } - @Override public void run() { - if (!isLuceneEnabled) { + if (!isReady()) { return; } - if (firstRun.get() || isPollingMode) { - // update all indexes on first run or if polling mode - firstRun.set(false); - queue.addAll(GitBlit.self().getRepositoryList()); - } - - Set<String> processed = new HashSet<String>(); - if (!queue.isEmpty()) { - // update the repository Lucene index - String name = null; - while ((name = queue.poll()) != null) { - if (processed.contains(name)) { - // skipping multi-queued repository - continue; - } - try { - Repository repository = GitBlit.self().getRepository(name); - if (repository == null) { - logger.warn(MessageFormat.format( - "Lucene executor could not find repository {0}. Skipping.", - name)); - continue; - } - index(name, repository); - repository.close(); - System.gc(); - processed.add(name); - } catch (Throwable e) { - logger.error(MessageFormat.format("Failed to update {0} Lucene index", - name), e); - } + for (String repositoryName : GitBlit.self().getRepositoryList()) { + Repository repository = GitBlit.self().getRepository(repositoryName); + if (repository == null) { + logger.warn(MessageFormat.format( + "Lucene executor could not find repository {0}. Skipping.", + repositoryName)); + continue; } + // TODO allow repository to bypass Lucene indexing + index(repositoryName, repository); + repository.close(); + System.gc(); } } @@ -140,34 +194,31 @@ * @param repository * the repository object */ - public void index(String name, Repository repository) { + protected void index(String name, Repository repository) { try { if (JGitUtils.hasCommits(repository)) { - if (LuceneUtils.shouldReindex(repository)) { - // (re)build the entire index - long start = System.currentTimeMillis(); - IndexResult result = LuceneUtils.reindex(name, repository); - float duration = (System.currentTimeMillis() - start)/1000f; + if (shouldReindex(repository)) { + // (re)build the entire index + IndexResult result = reindex(name, repository); + if (result.success) { if (result.commitCount > 0) { String msg = "Built {0} Lucene index from {1} commits and {2} files across {3} branches in {4} secs"; logger.info(MessageFormat.format(msg, name, - result.commitCount, result.blobCount, result.branchCount, duration)); + result.commitCount, result.blobCount, result.branchCount, result.duration())); } } else { String msg = "Could not build {0} Lucene index!"; logger.error(MessageFormat.format(msg, name)); } } else { - // update the index with latest commits - long start = System.currentTimeMillis(); - IndexResult result = LuceneUtils.updateIndex(name, repository); - float duration = (System.currentTimeMillis() - start)/1000f; + // update the index with latest commits + IndexResult result = updateIndex(name, repository); if (result.success) { if (result.commitCount > 0) { String msg = "Updated {0} Lucene index with {1} commits and {2} files across {3} branches in {4} secs"; logger.info(MessageFormat.format(msg, name, - result.commitCount, result.blobCount, result.branchCount, duration)); + result.commitCount, result.blobCount, result.branchCount, result.duration())); } } else { String msg = "Could not update {0} Lucene index!"; @@ -188,6 +239,953 @@ * */ public void close() { - LuceneUtils.close(); + // close all writers + for (String writer : writers.keySet()) { + try { + writers.get(writer).close(true); + } catch (Throwable t) { + logger.error("Failed to close Lucene writer for " + writer, t); + } + } + writers.clear(); + + // close all searchers + for (String searcher : searchers.keySet()) { + try { + searchers.get(searcher).close(); + } catch (Throwable t) { + logger.error("Failed to close Lucene searcher for " + searcher, t); + } + } + searchers.clear(); + } + + + /** + * Deletes the Lucene index for the specified repository. + * + * @param repositoryName + * @return true, if successful + */ + public boolean deleteIndex(String repositoryName) { + try { + // remove the repository index writer from the cache and close it + IndexWriter writer = writers.remove(repositoryName); + if (writer != null) { + writer.close(); + writer = null; + } + // remove the repository index searcher from the cache and close it + IndexSearcher searcher = searchers.remove(repositoryName); + if (searcher != null) { + searcher.close(); + searcher = null; + } + // delete the index folder + File repositoryFolder = new File(repositoriesFolder, repositoryName); + File luceneIndex = new File(repositoryFolder, LUCENE_DIR); + if (luceneIndex.exists()) { + org.eclipse.jgit.util.FileUtils.delete(luceneIndex, + org.eclipse.jgit.util.FileUtils.RECURSIVE); + } + // delete the config file + File luceneConfig = new File(repositoryFolder, CONF_FILE); + if (luceneConfig.exists()) { + luceneConfig.delete(); + } + return true; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + + /** + * Returns the author for the commit, if this information is available. + * + * @param commit + * @return an author or unknown + */ + private String getAuthor(RevCommit commit) { + String name = "unknown"; + try { + name = commit.getAuthorIdent().getName(); + if (StringUtils.isEmpty(name)) { + name = commit.getAuthorIdent().getEmailAddress(); + } + } catch (NullPointerException n) { + } + return name; + } + + /** + * Returns the committer for the commit, if this information is available. + * + * @param commit + * @return an committer or unknown + */ + private String getCommitter(RevCommit commit) { + String name = "unknown"; + try { + name = commit.getCommitterIdent().getName(); + if (StringUtils.isEmpty(name)) { + name = commit.getCommitterIdent().getEmailAddress(); + } + } catch (NullPointerException n) { + } + return name; + } + + /** + * Construct a keyname from the branch. + * + * @param branchName + * @return a keyname appropriate for the Git config file format + */ + private String getBranchKey(String branchName) { + return StringUtils.getSHA1(branchName); + } + + /** + * Returns the Lucene configuration for the specified repository. + * + * @param repository + * @return a config object + */ + private FileBasedConfig getConfig(Repository repository) { + File file = new File(repository.getDirectory(), CONF_FILE); + FileBasedConfig config = new FileBasedConfig(file, FS.detect()); + return config; + } + + /** + * Reads the Lucene config file for the repository to check the index + * version. If the index version is different, then rebuild the repository + * index. + * + * @param repository + * @return true of the on-disk index format is different than INDEX_VERSION + */ + protected boolean shouldReindex(Repository repository) { + try { + FileBasedConfig config = getConfig(repository); + config.load(); + int indexVersion = config.getInt(CONF_INDEX, CONF_VERSION, 0); + // reindex if versions do not match + return indexVersion != INDEX_VERSION; + } catch (Throwable t) { + } + return true; + } + + + /** + * This completely indexes the repository and will destroy any existing + * index. + * + * @param repositoryName + * @param repository + * @return IndexResult + */ + public IndexResult reindex(String repositoryName, Repository repository) { + IndexResult result = new IndexResult(); + if (!deleteIndex(repositoryName)) { + return result; + } + try { + FileBasedConfig config = getConfig(repository); + Set<String> indexedCommits = new TreeSet<String>(); + IndexWriter writer = getIndexWriter(repositoryName); + // build a quick lookup of tags + Map<String, List<String>> tags = new HashMap<String, List<String>>(); + for (RefModel tag : JGitUtils.getTags(repository, false, -1)) { + if (!tag.isAnnotatedTag()) { + // skip non-annotated tags + continue; + } + if (!tags.containsKey(tag.getObjectId())) { + tags.put(tag.getReferencedObjectId().getName(), new ArrayList<String>()); + } + tags.get(tag.getReferencedObjectId().getName()).add(tag.displayName); + } + + ObjectReader reader = repository.newObjectReader(); + + // get the local branches + List<RefModel> branches = JGitUtils.getLocalBranches(repository, true, -1); + + // sort them by most recently updated + Collections.sort(branches, new Comparator<RefModel>() { + @Override + public int compare(RefModel ref1, RefModel ref2) { + return ref2.getDate().compareTo(ref1.getDate()); + } + }); + + // reorder default branch to first position + RefModel defaultBranch = null; + ObjectId defaultBranchId = JGitUtils.getDefaultBranch(repository); + for (RefModel branch : branches) { + if (branch.getObjectId().equals(defaultBranchId)) { + defaultBranch = branch; + break; + } + } + branches.remove(defaultBranch); + branches.add(0, defaultBranch); + + // walk through each branch + for (RefModel branch : branches) { + if (excludedBranches.contains(branch.getName())) { + continue; + } + + String branchName = branch.getName(); + RevWalk revWalk = new RevWalk(reader); + RevCommit tip = revWalk.parseCommit(branch.getObjectId()); + String tipId = tip.getId().getName(); + + String keyName = getBranchKey(branchName); + config.setString(CONF_ALIAS, null, keyName, branchName); + config.setString(CONF_BRANCH, null, keyName, tipId); + + // index the blob contents of the tree + TreeWalk treeWalk = new TreeWalk(repository); + treeWalk.addTree(tip.getTree()); + treeWalk.setRecursive(true); + + Map<String, ObjectId> paths = new TreeMap<String, ObjectId>(); + while (treeWalk.next()) { + paths.put(treeWalk.getPathString(), treeWalk.getObjectId(0)); + } + + ByteArrayOutputStream os = new ByteArrayOutputStream(); + byte[] tmp = new byte[32767]; + + RevWalk commitWalk = new RevWalk(reader); + commitWalk.markStart(tip); + + RevCommit commit; + while ((paths.size() > 0) && (commit = commitWalk.next()) != null) { + TreeWalk diffWalk = new TreeWalk(reader); + int parentCount = commit.getParentCount(); + switch (parentCount) { + case 0: + diffWalk.addTree(new EmptyTreeIterator()); + break; + case 1: + diffWalk.addTree(getTree(commitWalk, commit.getParent(0))); + break; + default: + // skip merge commits + continue; + } + diffWalk.addTree(getTree(commitWalk, commit)); + diffWalk.setFilter(ANY_DIFF); + diffWalk.setRecursive(true); + while ((paths.size() > 0) && diffWalk.next()) { + String path = diffWalk.getPathString(); + if (!paths.containsKey(path)) { + continue; + } + + // remove path from set + ObjectId blobId = paths.remove(path); + result.blobCount++; + + // index the blob metadata + String blobAuthor = getAuthor(commit); + String blobCommitter = getCommitter(commit); + String blobDate = DateTools.timeToString(commit.getCommitTime() * 1000L, + Resolution.MINUTE); + + Document doc = new Document(); + doc.add(new Field(FIELD_OBJECT_TYPE, SearchObjectType.blob.name(), Store.YES, Index.NOT_ANALYZED_NO_NORMS)); + doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_COMMIT, commit.getName(), Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_PATH, path, Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_DATE, blobDate, Store.YES, Index.NO)); + doc.add(new Field(FIELD_AUTHOR, blobAuthor, Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_COMMITTER, blobCommitter, Store.YES, Index.ANALYZED)); + + // determine extension to compare to the extension + // blacklist + String ext = null; + String name = path.toLowerCase(); + if (name.indexOf('.') > -1) { + ext = name.substring(name.lastIndexOf('.') + 1); + } + + // index the blob content + if (StringUtils.isEmpty(ext) || !excludedExtensions.contains(ext)) { + ObjectLoader ldr = repository.open(blobId, Constants.OBJ_BLOB); + InputStream in = ldr.openStream(); + int n; + while ((n = in.read(tmp)) > 0) { + os.write(tmp, 0, n); + } + in.close(); + byte[] content = os.toByteArray(); + String str = new String(content, Constants.CHARACTER_ENCODING); + doc.add(new Field(FIELD_CONTENT, str, Store.YES, Index.ANALYZED)); + os.reset(); + } + + // add the blob to the index + writer.addDocument(doc); + } + } + + os.close(); + + // index the tip commit object + if (indexedCommits.add(tipId)) { + Document doc = createDocument(tip, tags.get(tipId)); + doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.ANALYZED)); + writer.addDocument(doc); + result.commitCount += 1; + result.branchCount += 1; + } + + // traverse the log and index the previous commit objects + RevWalk historyWalk = new RevWalk(reader); + historyWalk.markStart(historyWalk.parseCommit(tip.getId())); + RevCommit rev; + while ((rev = historyWalk.next()) != null) { + String hash = rev.getId().getName(); + if (indexedCommits.add(hash)) { + Document doc = createDocument(rev, tags.get(hash)); + doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.ANALYZED)); + writer.addDocument(doc); + result.commitCount += 1; + } + } + } + + // finished + reader.release(); + + // this repository has a gb-issues branch, index all issues + if (IssueUtils.getIssuesBranch(repository) != null) { + List<IssueModel> issues = IssueUtils.getIssues(repository, null); + if (issues.size() > 0) { + result.branchCount += 1; + } + for (IssueModel issue : issues) { + result.issueCount++; + Document doc = createDocument(issue); + doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.ANALYZED)); + writer.addDocument(doc); + } + } + + // commit all changes and reset the searcher + config.setInt(CONF_INDEX, null, CONF_VERSION, INDEX_VERSION); + config.save(); + resetIndexSearcher(repositoryName); + writer.commit(); + result.success(); + } catch (Exception e) { + logger.error("Exception while reindexing " + repositoryName, e); + } + return result; + } + + /** + * Get the tree associated with the given commit. + * + * @param walk + * @param commit + * @return tree + * @throws IOException + */ + protected RevTree getTree(final RevWalk walk, final RevCommit commit) + throws IOException { + final RevTree tree = commit.getTree(); + if (tree != null) { + return tree; + } + walk.parseHeaders(commit); + return commit.getTree(); + } + + /** + * Incrementally update the index with the specified commit for the + * repository. + * + * @param repositoryName + * @param repository + * @param branch + * the fully qualified branch name (e.g. refs/heads/master) + * @param commit + * @return true, if successful + */ + private IndexResult index(String repositoryName, Repository repository, + String branch, RevCommit commit) { + IndexResult result = new IndexResult(); + try { + if (excludedBranches.contains(branch)) { + if (IssueUtils.GB_ISSUES.equals(branch)) { + // index an issue + String issueId = commit.getShortMessage().substring(2).trim(); + IssueModel issue = IssueUtils.getIssue(repository, issueId); + if (issue == null) { + // issue was deleted, remove from index + deleteIssue(repositoryName, issueId); + result.success = true; + return result; + } + result.success = index(repositoryName, issue); + result.issueCount++; + return result; + + } + return result; + } + List<PathChangeModel> changedPaths = JGitUtils.getFilesInCommit(repository, commit); + String revDate = DateTools.timeToString(commit.getCommitTime() * 1000L, + Resolution.MINUTE); + IndexWriter writer = getIndexWriter(repositoryName); + for (PathChangeModel path : changedPaths) { + // delete the indexed blob + deleteBlob(repositoryName, branch, path.path); + + // re-index the blob + if (!ChangeType.DELETE.equals(path.changeType)) { + result.blobCount++; + Document doc = new Document(); + doc.add(new Field(FIELD_OBJECT_TYPE, SearchObjectType.blob.name(), Store.YES, + Index.NOT_ANALYZED)); + doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_BRANCH, branch, Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_COMMIT, commit.getName(), Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_PATH, path.path, Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_DATE, revDate, Store.YES, Index.NO)); + doc.add(new Field(FIELD_AUTHOR, getAuthor(commit), Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_COMMITTER, getCommitter(commit), Store.YES, Index.ANALYZED)); + + // determine extension to compare to the extension + // blacklist + String ext = null; + String name = path.name.toLowerCase(); + if (name.indexOf('.') > -1) { + ext = name.substring(name.lastIndexOf('.') + 1); + } + + if (StringUtils.isEmpty(ext) || !excludedExtensions.contains(ext)) { + // read the blob content + String str = JGitUtils.getStringContent(repository, commit.getTree(), + path.path); + doc.add(new Field(FIELD_CONTENT, str, Store.YES, Index.ANALYZED)); + writer.addDocument(doc); + } + } + } + writer.commit(); + + Document doc = createDocument(commit, null); + result.commitCount++; + result.success = index(repositoryName, doc); + } catch (Exception e) { + logger.error(MessageFormat.format("Exception while indexing commit {0} in {1}", commit.getId().getName(), repositoryName), e); + } + return result; + } + + /** + * Incrementally update the index with the specified issue for the + * repository. + * + * @param repositoryName + * @param issue + * @return true, if successful + */ + public boolean index(String repositoryName, IssueModel issue) { + try { + // delete the old issue from the index, if exists + deleteIssue(repositoryName, issue.id); + Document doc = createDocument(issue); + return index(repositoryName, doc); + } catch (Exception e) { + logger.error(MessageFormat.format("Error while indexing issue {0} in {1}", issue.id, repositoryName), e); + } + return false; + } + + /** + * Delete an issue from the repository index. + * + * @param repositoryName + * @param issueId + * @throws Exception + */ + private void deleteIssue(String repositoryName, String issueId) throws Exception { + BooleanQuery query = new BooleanQuery(); + Term objectTerm = new Term(FIELD_OBJECT_TYPE, SearchObjectType.issue.name()); + query.add(new TermQuery(objectTerm), Occur.MUST); + Term issueidTerm = new Term(FIELD_ISSUE, issueId); + query.add(new TermQuery(issueidTerm), Occur.MUST); + + IndexWriter writer = getIndexWriter(repositoryName); + writer.deleteDocuments(query); + writer.commit(); + } + + /** + * Delete a blob from the specified branch of the repository index. + * + * @param repositoryName + * @param branch + * @param path + * @throws Exception + */ + private void deleteBlob(String repositoryName, String branch, String path) throws Exception { + BooleanQuery query = new BooleanQuery(); + Term objectTerm = new Term(FIELD_OBJECT_TYPE, SearchObjectType.blob.name()); + query.add(new TermQuery(objectTerm), Occur.MUST); + Term branchTerm = new Term(FIELD_BRANCH, branch); + query.add(new TermQuery(branchTerm), Occur.MUST); + Term pathTerm = new Term(FIELD_PATH, path); + query.add(new TermQuery(pathTerm), Occur.MUST); + + IndexWriter writer = getIndexWriter(repositoryName); + writer.deleteDocuments(query); + writer.commit(); + } + + /** + * Updates a repository index incrementally from the last indexed commits. + * + * @param repositoryName + * @param repository + * @return IndexResult + */ + protected IndexResult updateIndex(String repositoryName, Repository repository) { + IndexResult result = new IndexResult(); + try { + FileBasedConfig config = getConfig(repository); + config.load(); + + // build a quick lookup of annotated tags + Map<String, List<String>> tags = new HashMap<String, List<String>>(); + for (RefModel tag : JGitUtils.getTags(repository, false, -1)) { + if (!tag.isAnnotatedTag()) { + // skip non-annotated tags + continue; + } + if (!tags.containsKey(tag.getObjectId())) { + tags.put(tag.getReferencedObjectId().getName(), new ArrayList<String>()); + } + tags.get(tag.getReferencedObjectId().getName()).add(tag.displayName); + } + + // detect branch deletion + // first assume all branches are deleted and then remove each + // existing branch from deletedBranches during indexing + Set<String> deletedBranches = new TreeSet<String>(); + for (String alias : config.getNames(CONF_ALIAS)) { + String branch = config.getString(CONF_ALIAS, null, alias); + deletedBranches.add(branch); + } + + // walk through each branches + List<RefModel> branches = JGitUtils.getLocalBranches(repository, true, -1); + for (RefModel branch : branches) { + String branchName = branch.getName(); + + // remove this branch from the deletedBranches set + deletedBranches.remove(branchName); + + // determine last commit + String keyName = getBranchKey(branchName); + String lastCommit = config.getString(CONF_BRANCH, null, keyName); + + List<RevCommit> revs; + if (StringUtils.isEmpty(lastCommit)) { + // new branch/unindexed branch, get all commits on branch + revs = JGitUtils.getRevLog(repository, branchName, 0, -1); + } else { + // pre-existing branch, get changes since last commit + revs = JGitUtils.getRevLog(repository, lastCommit, branchName); + } + + if (revs.size() > 0) { + result.branchCount += 1; + } + + // reverse the list of commits so we start with the first commit + Collections.reverse(revs); + for (RevCommit commit : revs) { + result.add(index(repositoryName, repository, branchName, commit)); + } + + // update the config + config.setInt(CONF_INDEX, null, CONF_VERSION, INDEX_VERSION); + config.setString(CONF_ALIAS, null, keyName, branchName); + config.setString(CONF_BRANCH, null, keyName, branch.getObjectId().getName()); + config.save(); + } + + // the deletedBranches set will normally be empty by this point + // unless a branch really was deleted and no longer exists + if (deletedBranches.size() > 0) { + for (String branch : deletedBranches) { + IndexWriter writer = getIndexWriter(repositoryName); + writer.deleteDocuments(new Term(FIELD_BRANCH, branch)); + writer.commit(); + } + } + result.success = true; + } catch (Throwable t) { + logger.error(MessageFormat.format("Exception while updating {0} Lucene index", repositoryName), t); + } + return result; + } + + /** + * Creates a Lucene document from an issue. + * + * @param issue + * @return a Lucene document + */ + private Document createDocument(IssueModel issue) { + Document doc = new Document(); + doc.add(new Field(FIELD_OBJECT_TYPE, SearchObjectType.issue.name(), Store.YES, + Field.Index.NOT_ANALYZED)); + doc.add(new Field(FIELD_ISSUE, issue.id, Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_BRANCH, IssueUtils.GB_ISSUES, Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_DATE, DateTools.dateToString(issue.created, Resolution.MINUTE), + Store.YES, Field.Index.NO)); + doc.add(new Field(FIELD_AUTHOR, issue.reporter, Store.YES, Index.ANALYZED)); + List<String> attachments = new ArrayList<String>(); + for (Attachment attachment : issue.getAttachments()) { + attachments.add(attachment.name.toLowerCase()); + } + doc.add(new Field(FIELD_ATTACHMENT, StringUtils.flattenStrings(attachments), Store.YES, + Index.ANALYZED)); + doc.add(new Field(FIELD_SUMMARY, issue.summary, Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_CONTENT, issue.toString(), Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_LABEL, StringUtils.flattenStrings(issue.getLabels()), Store.YES, + Index.ANALYZED)); + return doc; + } + + /** + * Creates a Lucene document for a commit + * + * @param commit + * @param tags + * @return a Lucene document + */ + private Document createDocument(RevCommit commit, List<String> tags) { + Document doc = new Document(); + doc.add(new Field(FIELD_OBJECT_TYPE, SearchObjectType.commit.name(), Store.YES, + Index.NOT_ANALYZED)); + doc.add(new Field(FIELD_COMMIT, commit.getName(), Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_DATE, DateTools.timeToString(commit.getCommitTime() * 1000L, + Resolution.MINUTE), Store.YES, Index.NO)); + doc.add(new Field(FIELD_AUTHOR, getAuthor(commit), Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_COMMITTER, getCommitter(commit), Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_SUMMARY, commit.getShortMessage(), Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_CONTENT, commit.getFullMessage(), Store.YES, Index.ANALYZED)); + if (!ArrayUtils.isEmpty(tags)) { + doc.add(new Field(FIELD_TAG, StringUtils.flattenStrings(tags), Store.YES, Index.ANALYZED)); + } + return doc; + } + + /** + * Incrementally index an object for the repository. + * + * @param repositoryName + * @param doc + * @return true, if successful + */ + private boolean index(String repositoryName, Document doc) { + try { + doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.NOT_ANALYZED)); + IndexWriter writer = getIndexWriter(repositoryName); + writer.addDocument(doc); + resetIndexSearcher(repositoryName); + writer.commit(); + return true; + } catch (Exception e) { + logger.error(MessageFormat.format("Exception while incrementally updating {0} Lucene index", repositoryName), e); + } + return false; + } + + private SearchResult createSearchResult(Document doc, float score) throws ParseException { + SearchResult result = new SearchResult(); + result.score = score; + result.date = DateTools.stringToDate(doc.get(FIELD_DATE)); + result.summary = doc.get(FIELD_SUMMARY); + result.author = doc.get(FIELD_AUTHOR); + result.committer = doc.get(FIELD_COMMITTER); + result.type = SearchObjectType.fromName(doc.get(FIELD_OBJECT_TYPE)); + result.repository = doc.get(FIELD_REPOSITORY); + result.branch = doc.get(FIELD_BRANCH); + result.commitId = doc.get(FIELD_COMMIT); + result.issueId = doc.get(FIELD_ISSUE); + result.path = doc.get(FIELD_PATH); + if (doc.get(FIELD_TAG) != null) { + result.tags = StringUtils.getStringsFromValue(doc.get(FIELD_TAG)); + } + if (doc.get(FIELD_LABEL) != null) { + result.labels = StringUtils.getStringsFromValue(doc.get(FIELD_LABEL)); + } + return result; + } + + private synchronized void resetIndexSearcher(String repository) throws IOException { + IndexSearcher searcher = searchers.remove(repository); + if (searcher != null) { + searcher.close(); + } + } + + /** + * Gets an index searcher for the repository. + * + * @param repository + * @return + * @throws IOException + */ + private IndexSearcher getIndexSearcher(String repository) throws IOException { + IndexSearcher searcher = searchers.get(repository); + if (searcher == null) { + IndexWriter writer = getIndexWriter(repository); + searcher = new IndexSearcher(IndexReader.open(writer, true)); + searchers.put(repository, searcher); + } + return searcher; + } + + /** + * Gets an index writer for the repository. The index will be created if it + * does not already exist or if forceCreate is specified. + * + * @param repository + * @return an IndexWriter + * @throws IOException + */ + private IndexWriter getIndexWriter(String repository) throws IOException { + IndexWriter indexWriter = writers.get(repository); + File repositoryFolder = new File(repositoriesFolder, repository); + File indexFolder = new File(repositoryFolder, LUCENE_DIR); + Directory directory = FSDirectory.open(indexFolder); + + if (indexWriter == null) { + if (!indexFolder.exists()) { + indexFolder.mkdirs(); + } + StandardAnalyzer analyzer = new StandardAnalyzer(LUCENE_VERSION); + IndexWriterConfig config = new IndexWriterConfig(LUCENE_VERSION, analyzer); + config.setOpenMode(OpenMode.CREATE_OR_APPEND); + indexWriter = new IndexWriter(directory, config); + writers.put(repository, indexWriter); + } + return indexWriter; + } + + /** + * Searches the specified repositories for the given text or query + * + * @param text + * if the text is null or empty, null is returned + * @param maximumHits + * the maximum number of hits to collect + * @param repositories + * a list of repositories to search. if no repositories are + * specified null is returned. + * @return a list of SearchResults in order from highest to the lowest score + * + */ + public List<SearchResult> search(String text, int maximumHits, List<String> repositories) { + if (ArrayUtils.isEmpty(repositories)) { + return null; + } + return search(text, maximumHits, repositories.toArray(new String[0])); + } + + /** + * Searches the specified repositories for the given text or query + * + * @param text + * if the text is null or empty, null is returned + * @param maximumHits + * the maximum number of hits to collect + * @param repositories + * a list of repositories to search. if no repositories are + * specified null is returned. + * @return a list of SearchResults in order from highest to the lowest score + * + */ + public List<SearchResult> search(String text, int maximumHits, String... repositories) { + if (StringUtils.isEmpty(text)) { + return null; + } + if (ArrayUtils.isEmpty(repositories)) { + return null; + } + Set<SearchResult> results = new LinkedHashSet<SearchResult>(); + StandardAnalyzer analyzer = new StandardAnalyzer(LUCENE_VERSION); + try { + // default search checks summary and content + BooleanQuery query = new BooleanQuery(); + QueryParser qp; + qp = new QueryParser(LUCENE_VERSION, FIELD_SUMMARY, analyzer); + qp.setAllowLeadingWildcard(true); + query.add(qp.parse(text), Occur.SHOULD); + + qp = new QueryParser(LUCENE_VERSION, FIELD_CONTENT, analyzer); + qp.setAllowLeadingWildcard(true); + query.add(qp.parse(text), Occur.SHOULD); + + IndexSearcher searcher; + if (repositories.length == 1) { + // single repository search + searcher = getIndexSearcher(repositories[0]); + } else { + // multiple repository search + List<IndexReader> readers = new ArrayList<IndexReader>(); + for (String repository : repositories) { + IndexSearcher repositoryIndex = getIndexSearcher(repository); + readers.add(repositoryIndex.getIndexReader()); + } + IndexReader[] rdrs = readers.toArray(new IndexReader[readers.size()]); + MultiReader reader = new MultiReader(rdrs); + searcher = new IndexSearcher(reader); + } + Query rewrittenQuery = searcher.rewrite(query); + TopScoreDocCollector collector = TopScoreDocCollector.create(maximumHits, true); + searcher.search(rewrittenQuery, collector); + ScoreDoc[] hits = collector.topDocs().scoreDocs; + for (int i = 0; i < hits.length; i++) { + int docId = hits[i].doc; + Document doc = searcher.doc(docId); + // TODO identify the source index for the doc, then eliminate FIELD_REPOSITORY + SearchResult result = createSearchResult(doc, hits[i].score); + String content = doc.get(FIELD_CONTENT); + result.fragment = getHighlightedFragment(analyzer, query, content, result); + results.add(result); + } + } catch (Exception e) { + logger.error(MessageFormat.format("Exception while searching for {0}", text), e); + } + return new ArrayList<SearchResult>(results); + } + + /** + * + * @param analyzer + * @param query + * @param content + * @param result + * @return + * @throws IOException + * @throws InvalidTokenOffsetsException + */ + private String getHighlightedFragment(Analyzer analyzer, Query query, + String content, SearchResult result) throws IOException, InvalidTokenOffsetsException { + content = content == null ? "":StringUtils.escapeForHtml(content, false); + + QueryScorer scorer = new QueryScorer(query, "content"); + Fragmenter fragmenter; + + // TODO improve the fragmenter - hopefully on line breaks + if (SearchObjectType.commit == result.type) { + fragmenter = new SimpleSpanFragmenter(scorer, 1024); + } else { + fragmenter = new SimpleSpanFragmenter(scorer, 150); + } + + // use an artificial delimiter for the token + String termTag = "<!--["; + String termTagEnd = "]-->"; + SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(termTag, termTagEnd); + Highlighter highlighter = new Highlighter(formatter, scorer); + highlighter.setTextFragmenter(fragmenter); + + String [] fragments = highlighter.getBestFragments(analyzer, "content", content, 5); + if (ArrayUtils.isEmpty(fragments)) { + if (SearchObjectType.blob == result.type) { + return ""; + } + return "<pre class=\"text\">" + content + "</pre>"; + } + StringBuilder sb = new StringBuilder(); + for (int i = 0, len = fragments.length; i < len; i++) { + String fragment = fragments[i]; + + // resurrect the raw fragment from removing the artificial delimiters + String raw = fragment.replace(termTag, "").replace(termTagEnd, ""); + sb.append(getPreTag(result, raw, content)); + + // replace the artificial delimiter with html tags + String html = fragment.replace(termTag, "<span class=\"highlight\">").replace(termTagEnd, "</span>"); + sb.append(html); + sb.append("</pre>"); + if (i < len - 1) { + sb.append("<span class=\"ellipses\">...</span><br/>"); + } + } + return sb.toString(); + } + + /** + * Returns the appropriate tag for a fragment. Commit messages are visually + * differentiated from blob fragments. + * + * @param result + * @param fragment + * @param content + * @return an html tag appropriate for the fragment + */ + private String getPreTag(SearchResult result, String fragment, String content) { + String pre = "<pre class=\"text\">"; + if (SearchObjectType.blob == result.type) { + int line = StringUtils.countLines(content.substring(0, content.indexOf(fragment))); + int lastDot = result.path.lastIndexOf('.'); + if (lastDot > -1) { + String ext = result.path.substring(lastDot + 1).toLowerCase(); + pre = MessageFormat.format("<pre class=\"prettyprint linenums:{0,number,0} lang-{1}\">", line, ext); + } else { + pre = MessageFormat.format("<pre class=\"prettyprint linenums:{0,number,0}\">", line); + } + } + return pre; + } + + /** + * Simple class to track the results of an index update. + */ + private class IndexResult { + long startTime = System.currentTimeMillis(); + long endTime = startTime; + boolean success; + int branchCount; + int commitCount; + int blobCount; + int issueCount; + + void add(IndexResult result) { + this.branchCount += result.branchCount; + this.commitCount += result.commitCount; + this.blobCount += result.blobCount; + this.issueCount += result.issueCount; + } + + void success() { + success = true; + endTime = System.currentTimeMillis(); + } + + float duration() { + return (endTime - startTime)/1000f; + } } } -- Gitblit v1.9.1