From 3d0494ad6db80843add9bf9d6eac783a691ac5a1 Mon Sep 17 00:00:00 2001
From: James Moger <james.moger@gitblit.com>
Date: Fri, 09 Mar 2012 20:45:52 -0500
Subject: [PATCH] More Lucene progress

---
 src/com/gitblit/utils/LuceneUtils.java |  261 +++++++++++++++++++++++++++++++++++-----------------
 1 files changed, 175 insertions(+), 86 deletions(-)

diff --git a/src/com/gitblit/utils/LuceneUtils.java b/src/com/gitblit/utils/LuceneUtils.java
index e824236..fe36a83 100644
--- a/src/com/gitblit/utils/LuceneUtils.java
+++ b/src/com/gitblit/utils/LuceneUtils.java
@@ -1,3 +1,18 @@
+/*
+ * Copyright 2012 gitblit.com.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package com.gitblit.utils;
 
 import java.io.ByteArrayOutputStream;
@@ -50,9 +65,11 @@
 import org.eclipse.jgit.revwalk.RevWalk;
 import org.eclipse.jgit.storage.file.FileBasedConfig;
 import org.eclipse.jgit.treewalk.TreeWalk;
+import org.eclipse.jgit.treewalk.filter.AndTreeFilter;
+import org.eclipse.jgit.treewalk.filter.PathFilterGroup;
+import org.eclipse.jgit.treewalk.filter.TreeFilter;
 import org.eclipse.jgit.util.FS;
 
-import com.gitblit.GitBlit;
 import com.gitblit.models.IssueModel;
 import com.gitblit.models.IssueModel.Attachment;
 import com.gitblit.models.PathModel.PathChangeModel;
@@ -96,6 +113,7 @@
 	private static final String FIELD_AUTHOR = "author";
 	private static final String FIELD_COMMITTER = "committer";
 	private static final String FIELD_DATE = "date";
+	private static final String FIELD_TAG = "tag";
 	private static final String FIELD_LABEL = "label";
 	private static final String FIELD_ATTACHMENT = "attachment";
 
@@ -109,27 +127,47 @@
 	private static final Map<File, IndexSearcher> SEARCHERS = new ConcurrentHashMap<File, IndexSearcher>();
 	private static final Map<File, IndexWriter> WRITERS = new ConcurrentHashMap<File, IndexWriter>();
 
+	private static final String LUCENE_DIR = "lucene";
 	private static final String CONF_FILE = "lucene.conf";
 	private static final String CONF_INDEX = "index";
 	private static final String CONF_VERSION = "version";
 	private static final String CONF_ALIAS = "aliases";
 	private static final String CONF_BRANCH = "branches";
-
+	
 	/**
-	 * Returns the name of the repository.
+	 * Returns the author for the commit, if this information is available.
 	 * 
-	 * @param repository
-	 * @return the repository name
+	 * @param commit
+	 * @return an author or unknown
 	 */
-	private static String getName(Repository repository) {
-		String rootPath = GitBlit.getRepositoriesFolder().getAbsolutePath();
-		if (repository.isBare()) {
-			return StringUtils.getRelativePath(rootPath, repository.getDirectory()
-					.getAbsolutePath());
-		} else {
-			return StringUtils.getRelativePath(rootPath, repository.getDirectory().getParentFile()
-					.getAbsolutePath());
+	private static String getAuthor(RevCommit commit) {
+		String name = "unknown";
+		try {
+			name = commit.getAuthorIdent().getName();
+			if (StringUtils.isEmpty(name)) {
+				name = commit.getAuthorIdent().getEmailAddress();
+			}
+		} catch (NullPointerException n) {						
 		}
+		return name;
+	}
+	
+	/**
+	 * Returns the committer for the commit, if this information is available.
+	 * 
+	 * @param commit
+	 * @return an committer or unknown
+	 */
+	private static String getCommitter(RevCommit commit) {
+		String name = "unknown";
+		try {
+			name = commit.getCommitterIdent().getName();
+			if (StringUtils.isEmpty(name)) {
+				name = commit.getCommitterIdent().getEmailAddress();
+			}
+		} catch (NullPointerException n) {						
+		}
+		return name;
 	}
 
 	/**
@@ -182,7 +220,7 @@
 	 */
 	public static boolean deleteIndex(Repository repository) {
 		try {
-			File luceneIndex = new File(repository.getDirectory(), "lucene");
+			File luceneIndex = new File(repository.getDirectory(), LUCENE_DIR);
 			if (luceneIndex.exists()) {
 				org.eclipse.jgit.util.FileUtils.delete(luceneIndex,
 						org.eclipse.jgit.util.FileUtils.RECURSIVE);
@@ -201,16 +239,22 @@
 	 * This completely indexes the repository and will destroy any existing
 	 * index.
 	 * 
+	 * @param repositoryName
 	 * @param repository
+	 * @param fullIndex
+	 *            If false blob metadata is set to the HEAD revision of each
+	 *            branch.  If true, each the last commit of each blob is
+	 *            determined to properly index the author, committer, and date.
+	 *            Full indexing can be time-consuming.
 	 * @return IndexResult
 	 */
-	public static IndexResult reindex(Repository repository) {
+	public static IndexResult reindex(String repositoryName, Repository repository,
+			boolean fullIndex) {
 		IndexResult result = new IndexResult();
 		if (!LuceneUtils.deleteIndex(repository)) {
 			return result;
 		}
-		try {
-			String repositoryName = getName(repository);
+		try {			
 			FileBasedConfig config = getConfig(repository);
 			Set<String> indexedCommits = new TreeSet<String>();
 			IndexWriter writer = getIndexWriter(repository, true);
@@ -235,40 +279,64 @@
 				}
 				String branchName = branch.getName();
 				RevWalk revWalk = new RevWalk(repository);
-				RevCommit rev = revWalk.parseCommit(branch.getObjectId());
+				RevCommit branchHead = revWalk.parseCommit(branch.getObjectId());
+				String head = branchHead.getId().getName();
 
 				String keyName = getBranchKey(branchName);
 				config.setString(CONF_ALIAS, null, keyName, branchName);
-				config.setString(CONF_BRANCH, null, keyName, rev.getName());
+				config.setString(CONF_BRANCH, null, keyName, head);
 
 				// index the blob contents of the tree
 				ByteArrayOutputStream os = new ByteArrayOutputStream();
 				byte[] tmp = new byte[32767];
 				TreeWalk treeWalk = new TreeWalk(repository);
-				treeWalk.addTree(rev.getTree());
+				treeWalk.addTree(branchHead.getTree());
 				treeWalk.setRecursive(true);
-				String revDate = DateTools.timeToString(rev.getCommitTime() * 1000L,
-						Resolution.MINUTE);
+				
 				while (treeWalk.next()) {
+					result.blobCount++;
+					String blobPath = treeWalk.getPathString();
+					RevCommit blobRev = branchHead;
+					
+					RevWalk blobWalk = null;
+					if (fullIndex) {
+						// XXX this is _really_ slow, there must be a better way
+						// determine the most recent commit for this blob
+						blobWalk = new RevWalk(repository);
+						blobWalk.markStart(blobWalk.parseCommit(branch.getObjectId()));
+						TreeFilter filter = AndTreeFilter.create(
+								PathFilterGroup.createFromStrings(Collections.singleton(blobPath)),
+								TreeFilter.ANY_DIFF);
+						blobWalk.setTreeFilter(filter);
+
+						for (RevCommit commit : blobWalk) {
+							blobRev = commit;
+							break;
+						}
+					}
+					
+					String blobAuthor = getAuthor(blobRev);
+					String blobCommitter = getCommitter(blobRev);
+					String blobDate = DateTools.timeToString(blobRev.getCommitTime() * 1000L,
+							Resolution.MINUTE);
+					
+					if (blobWalk != null) {
+						blobWalk.dispose();
+					}
+					
 					Document doc = new Document();
-					doc.add(new Field(FIELD_OBJECT_TYPE, ObjectType.blob.name(), Store.YES,
-							Index.NOT_ANALYZED_NO_NORMS));
-					doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES,
-							Index.NOT_ANALYZED));
-					doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.NOT_ANALYZED));
-					doc.add(new Field(FIELD_OBJECT_ID, treeWalk.getPathString(), Store.YES,
-							Index.NOT_ANALYZED));
-					doc.add(new Field(FIELD_DATE, revDate, Store.YES, Index.NO));
-					doc.add(new Field(FIELD_AUTHOR, rev.getAuthorIdent().getName(), Store.YES,
-							Index.NOT_ANALYZED_NO_NORMS));
-					doc.add(new Field(FIELD_COMMITTER, rev.getCommitterIdent().getName(),
-							Store.YES, Index.NOT_ANALYZED_NO_NORMS));
-					doc.add(new Field(FIELD_LABEL, branch.getName(), Store.YES, Index.ANALYZED));
+					doc.add(new Field(FIELD_OBJECT_TYPE, ObjectType.blob.name(), Store.YES, Index.NOT_ANALYZED_NO_NORMS));
+					doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.ANALYZED));
+					doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.ANALYZED));
+					doc.add(new Field(FIELD_OBJECT_ID, blobPath, Store.YES, Index.ANALYZED));
+					doc.add(new Field(FIELD_DATE, blobDate, Store.YES, Index.NO));
+					doc.add(new Field(FIELD_AUTHOR, blobAuthor, Store.YES, Index.ANALYZED));
+					doc.add(new Field(FIELD_COMMITTER, blobCommitter, Store.YES, Index.ANALYZED));					
 
 					// determine extension to compare to the extension
 					// blacklist
 					String ext = null;
-					String name = treeWalk.getPathString().toLowerCase();
+					String name = blobPath.toLowerCase();
 					if (name.indexOf('.') > -1) {
 						ext = name.substring(name.lastIndexOf('.') + 1);
 					}
@@ -298,25 +366,25 @@
 				treeWalk.release();
 
 				// index the head commit object
-				String head = rev.getId().getName();
 				if (indexedCommits.add(head)) {
-					Document doc = createDocument(rev, tags.get(head));
-					doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES,
-							Index.NOT_ANALYZED));
-					doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.NOT_ANALYZED));
+					Document doc = createDocument(branchHead, tags.get(head));
+					doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.ANALYZED));
+					doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.ANALYZED));
 					writer.addDocument(doc);
 					result.commitCount += 1;
+					result.branchCount += 1;
 				}
 
 				// traverse the log and index the previous commit objects
-				revWalk.markStart(rev);
+				revWalk.reset();
+				revWalk.markStart(branchHead);
+				RevCommit rev;
 				while ((rev = revWalk.next()) != null) {
 					String hash = rev.getId().getName();
 					if (indexedCommits.add(hash)) {
 						Document doc = createDocument(rev, tags.get(hash));
-						doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES,
-								Index.NOT_ANALYZED));
-						doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.NOT_ANALYZED));
+						doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.ANALYZED));
+						doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.ANALYZED));
 						writer.addDocument(doc);
 						result.commitCount += 1;
 					}
@@ -329,10 +397,13 @@
 			// this repository has a gb-issues branch, index all issues
 			if (IssueUtils.getIssuesBranch(repository) != null) {
 				List<IssueModel> issues = IssueUtils.getIssues(repository, null);
+				if (issues.size() > 0) {
+					result.branchCount += 1;
+				}
 				for (IssueModel issue : issues) {
+					result.issueCount++;
 					Document doc = createDocument(issue);
-					doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES,
-							Index.NOT_ANALYZED));
+					doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.ANALYZED));
 					writer.addDocument(doc);
 				}
 			}
@@ -353,13 +424,16 @@
 	 * Incrementally update the index with the specified commit for the
 	 * repository.
 	 * 
+	 * @param repositoryName
 	 * @param repository
 	 * @param branch
 	 *            the fully qualified branch name (e.g. refs/heads/master)
 	 * @param commit
 	 * @return true, if successful
 	 */
-	public static boolean index(Repository repository, String branch, RevCommit commit) {
+	private static IndexResult index(String repositoryName, Repository repository, 
+			String branch, RevCommit commit) {
+		IndexResult result = new IndexResult();
 		try {
 			if (excludedBranches.contains(branch)) {
 				if (IssueUtils.GB_ISSUES.equals(branch)) {
@@ -367,20 +441,23 @@
 					String issueId = commit.getShortMessage().substring(2).trim();
 					IssueModel issue = IssueUtils.getIssue(repository, issueId);
 					if (issue == null) {
-						// delete the old issue from the index, if exists
+						// issue was deleted, remove from index
 						IndexWriter writer = getIndexWriter(repository, false);
 						writer.deleteDocuments(
 								new Term(FIELD_OBJECT_TYPE, ObjectType.issue.name()), new Term(
 										FIELD_OBJECT_ID, issueId));
 						writer.commit();
-						return true;
+						result.success = true;
+						return result;
 					}
-					return index(repository, issue);
+					result.success = index(repositoryName, repository, issue);
+					result.issueCount++;
+					return result;
+					
 				}
-				return false;
+				return result;
 			}
 			List<PathChangeModel> changedPaths = JGitUtils.getFilesInCommit(repository, commit);
-			String repositoryName = getName(repository);
 			String revDate = DateTools.timeToString(commit.getCommitTime() * 1000L,
 					Resolution.MINUTE);
 			IndexWriter writer = getIndexWriter(repository, false);
@@ -391,19 +468,16 @@
 
 				// re-index the blob
 				if (!ChangeType.DELETE.equals(path.changeType)) {
+					result.blobCount++;
 					Document doc = new Document();
 					doc.add(new Field(FIELD_OBJECT_TYPE, ObjectType.blob.name(), Store.YES,
-							Index.NOT_ANALYZED_NO_NORMS));
-					doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES,
 							Index.NOT_ANALYZED));
-					doc.add(new Field(FIELD_BRANCH, branch, Store.YES, Index.NOT_ANALYZED));
-					doc.add(new Field(FIELD_OBJECT_ID, path.path, Store.YES, Index.NOT_ANALYZED));
+					doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.ANALYZED));
+					doc.add(new Field(FIELD_BRANCH, branch, Store.YES, Index.ANALYZED));
+					doc.add(new Field(FIELD_OBJECT_ID, path.path, Store.YES, Index.ANALYZED));
 					doc.add(new Field(FIELD_DATE, revDate, Store.YES, Index.NO));
-					doc.add(new Field(FIELD_AUTHOR, commit.getAuthorIdent().getName(), Store.YES,
-							Index.NOT_ANALYZED_NO_NORMS));
-					doc.add(new Field(FIELD_COMMITTER, commit.getCommitterIdent().getName(),
-							Store.YES, Index.NOT_ANALYZED_NO_NORMS));
-					doc.add(new Field(FIELD_LABEL, branch, Store.YES, Index.ANALYZED));
+					doc.add(new Field(FIELD_AUTHOR, getAuthor(commit), Store.YES, Index.ANALYZED));
+					doc.add(new Field(FIELD_COMMITTER, getCommitter(commit), Store.YES, Index.ANALYZED));
 
 					// determine extension to compare to the extension
 					// blacklist
@@ -425,11 +499,12 @@
 			writer.commit();
 
 			Document doc = createDocument(commit, null);
-			return index(repository, doc);
+			result.commitCount++;
+			result.success = index(repositoryName, repository, doc);
 		} catch (Exception e) {
 			e.printStackTrace();
 		}
-		return false;
+		return result;
 	}
 
 	/**
@@ -440,7 +515,7 @@
 	 * @param issue
 	 * @return true, if successful
 	 */
-	public static boolean index(Repository repository, IssueModel issue) {
+	public static boolean index(String repositoryName, Repository repository, IssueModel issue) {
 		try {
 			// delete the old issue from the index, if exists
 			IndexWriter writer = getIndexWriter(repository, false);
@@ -449,7 +524,7 @@
 			writer.commit();
 
 			Document doc = createDocument(issue);
-			return index(repository, doc);
+			return index(repositoryName, repository, doc);
 		} catch (Exception e) {
 			e.printStackTrace();
 		}
@@ -459,10 +534,11 @@
 	/**
 	 * Updates a repository index incrementally from the last indexed commits.
 	 * 
+	 * @param repositoryName
 	 * @param repository
 	 * @return IndexResult
 	 */
-	public static IndexResult updateIndex(Repository repository) {
+	public static IndexResult updateIndex(String repositoryName, Repository repository) {
 		IndexResult result = new IndexResult();
 		try {
 			FileBasedConfig config = getConfig(repository);
@@ -511,11 +587,14 @@
 					revs = JGitUtils.getRevLog(repository, lastCommit, branchName);
 				}
 
-				// reverse the list of commits so we start with the first commit
+				if (revs.size() > 0) {
+					result.branchCount += 1;
+				}
+				
+				// reverse the list of commits so we start with the first commit				
 				Collections.reverse(revs);
 				for (RevCommit commit : revs) {
-					index(repository, branchName, commit);
-					result.commitCount += 1;
+					result.add(index(repositoryName, repository, branchName, commit));					
 				}
 
 				// update the config
@@ -550,12 +629,12 @@
 	private static Document createDocument(IssueModel issue) {
 		Document doc = new Document();
 		doc.add(new Field(FIELD_OBJECT_TYPE, ObjectType.issue.name(), Store.YES,
-				Field.Index.NOT_ANALYZED_NO_NORMS));
-		doc.add(new Field(FIELD_OBJECT_ID, issue.id, Store.YES, Index.NOT_ANALYZED));
-		doc.add(new Field(FIELD_BRANCH, IssueUtils.GB_ISSUES, Store.YES, Index.NOT_ANALYZED));
+				Field.Index.NOT_ANALYZED));
+		doc.add(new Field(FIELD_OBJECT_ID, issue.id, Store.YES, Index.ANALYZED));
+		doc.add(new Field(FIELD_BRANCH, IssueUtils.GB_ISSUES, Store.YES, Index.ANALYZED));
 		doc.add(new Field(FIELD_DATE, DateTools.dateToString(issue.created, Resolution.MINUTE),
 				Store.YES, Field.Index.NO));
-		doc.add(new Field(FIELD_AUTHOR, issue.reporter, Store.YES, Index.NOT_ANALYZED_NO_NORMS));
+		doc.add(new Field(FIELD_AUTHOR, issue.reporter, Store.YES, Index.ANALYZED));
 		List<String> attachments = new ArrayList<String>();
 		for (Attachment attachment : issue.getAttachments()) {
 			attachments.add(attachment.name.toLowerCase());
@@ -579,19 +658,16 @@
 	private static Document createDocument(RevCommit commit, List<String> tags) {
 		Document doc = new Document();
 		doc.add(new Field(FIELD_OBJECT_TYPE, ObjectType.commit.name(), Store.YES,
-				Index.NOT_ANALYZED_NO_NORMS));
-		doc.add(new Field(FIELD_OBJECT_ID, commit.getName(), Store.YES, Index.NOT_ANALYZED));
+				Index.NOT_ANALYZED));
+		doc.add(new Field(FIELD_OBJECT_ID, commit.getName(), Store.YES, Index.ANALYZED));
 		doc.add(new Field(FIELD_DATE, DateTools.timeToString(commit.getCommitTime() * 1000L,
 				Resolution.MINUTE), Store.YES, Index.NO));
-		doc.add(new Field(FIELD_AUTHOR, commit.getCommitterIdent().getName(), Store.YES,
-				Index.NOT_ANALYZED_NO_NORMS));
+		doc.add(new Field(FIELD_AUTHOR, getAuthor(commit), Store.YES, Index.ANALYZED));
+		doc.add(new Field(FIELD_COMMITTER, getCommitter(commit), Store.YES, Index.ANALYZED));
 		doc.add(new Field(FIELD_SUMMARY, commit.getShortMessage(), Store.YES, Index.ANALYZED));
 		doc.add(new Field(FIELD_CONTENT, commit.getFullMessage(), Store.NO, Index.ANALYZED));
 		if (!ArrayUtils.isEmpty(tags)) {
-			if (!ArrayUtils.isEmpty(tags)) {
-				doc.add(new Field(FIELD_LABEL, StringUtils.flattenStrings(tags), Store.YES,
-						Index.ANALYZED));
-			}
+			doc.add(new Field(FIELD_TAG, StringUtils.flattenStrings(tags), Store.YES, Index.ANALYZED));
 		}
 		return doc;
 	}
@@ -599,13 +675,13 @@
 	/**
 	 * Incrementally index an object for the repository.
 	 * 
+	 * @param repositoryName
 	 * @param repository
 	 * @param doc
 	 * @return true, if successful
 	 */
-	private static boolean index(Repository repository, Document doc) {
-		try {
-			String repositoryName = getName(repository);
+	private static boolean index(String repositoryName, Repository repository, Document doc) {
+		try {			
 			doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.NOT_ANALYZED));
 			IndexWriter writer = getIndexWriter(repository, false);
 			writer.addDocument(doc);
@@ -629,6 +705,9 @@
 		result.repository = doc.get(FIELD_REPOSITORY);
 		result.branch = doc.get(FIELD_BRANCH);
 		result.id = doc.get(FIELD_OBJECT_ID);
+		if (doc.get(FIELD_TAG) != null) {
+			result.tags = StringUtils.getStringsFromValue(doc.get(FIELD_TAG));
+		}
 		if (doc.get(FIELD_LABEL) != null) {
 			result.labels = StringUtils.getStringsFromValue(doc.get(FIELD_LABEL));
 		}
@@ -672,7 +751,7 @@
 	private static IndexWriter getIndexWriter(Repository repository, boolean forceCreate)
 			throws IOException {
 		IndexWriter indexWriter = WRITERS.get(repository.getDirectory());
-		File indexFolder = new File(repository.getDirectory(), "lucene");
+		File indexFolder = new File(repository.getDirectory(), LUCENE_DIR);
 		Directory directory = FSDirectory.open(indexFolder);
 		if (forceCreate || !indexFolder.exists()) {
 			// if the writer is going to blow away the existing index and create
@@ -794,6 +873,16 @@
 
 	public static class IndexResult {
 		public boolean success;
+		public int branchCount;
 		public int commitCount;
+		public int blobCount;
+		public int issueCount;
+		
+		public void add(IndexResult result) {
+			this.branchCount += result.branchCount;
+			this.commitCount += result.commitCount;
+			this.blobCount += result.blobCount;
+			this.issueCount += result.issueCount;			
+		}
 	}
 }

--
Gitblit v1.9.1