From ae9e157ef4e6a3708489725d4436cc15d273308f Mon Sep 17 00:00:00 2001
From: James Moger <james.moger@gitblit.com>
Date: Thu, 07 Jun 2012 17:30:18 -0400
Subject: [PATCH] Try multiple encodings when working with string blobs (issue 97)

---
 src/com/gitblit/utils/StringUtils.java         |   42 +++++++++++++++++++++
 src/com/gitblit/GitBlit.java                   |    9 ++++
 src/com/gitblit/wicket/pages/RawPage.java      |    9 ++--
 docs/04_releases.mkd                           |    2 +
 tests/com/gitblit/tests/JGitUtilsTest.java     |    3 +
 distrib/gitblit.properties                     |   11 +++++
 src/com/gitblit/LuceneExecutor.java            |    3 +
 src/com/gitblit/PagesServlet.java              |    8 ++-
 src/com/gitblit/wicket/pages/MarkdownPage.java |    6 ++-
 src/com/gitblit/wicket/pages/BlobPage.java     |   11 +++--
 src/com/gitblit/wicket/pages/SummaryPage.java  |    3 +
 src/com/gitblit/utils/JGitUtils.java           |   11 +++--
 12 files changed, 96 insertions(+), 22 deletions(-)

diff --git a/distrib/gitblit.properties b/distrib/gitblit.properties
index 5292a91..8e76826 100644
--- a/distrib/gitblit.properties
+++ b/distrib/gitblit.properties
@@ -362,6 +362,16 @@
 # SINCE 0.5.0
 web.repositoriesMessage = gitblit
 
+# Ordered list of charsets/encodings to use when trying to display a blob.
+# UTF-8, ISO-8859-1, and the server's default charset are always appended to
+# this list as fallbacks, so they are tried even if the list is empty.  If all
+# encodings fail to cleanly decode the blob content, UTF-8 will be used with
+# the standard malformed input/unmappable character replacement strings.
+# 
+# SPACE-DELIMITED
+# SINCE 1.0.0
+web.blobEncodings = UTF-8 ISO-8859-1
+
 # Manually set the default timezone to be used by Gitblit for display in the 
 # web ui.  This value is independent of the JVM timezone.  Specifying a blank
 # value will default to the JVM timezone.
@@ -432,6 +442,7 @@
 # e.g.
 # web.otherUrls = ssh://localhost/git/{0} git://localhost/git/{0}
 #
+# SPACE-DELIMITED
 # SINCE 0.5.0
 web.otherUrls = 
 
diff --git a/docs/04_releases.mkd b/docs/04_releases.mkd
index d20000b..0098197 100644
--- a/docs/04_releases.mkd
+++ b/docs/04_releases.mkd
@@ -16,6 +16,8 @@
 
 #### additions
 
+- Added setting to control charsets for blob string decoding.  Default encodings are UTF-8, ISO-8859-1, and server's default charset. (issue 97)  
+    **New:** *web.blobEncodings = UTF-8 ISO-8859-1*  
 - Exposed JGit's internal configuration settings in gitblit.properties/web.xml (issue 93)  
     **New:** *git.packedGitWindowSize = 8k*  
     **New:** *git.packedGitLimit = 10m*  
diff --git a/src/com/gitblit/GitBlit.java b/src/com/gitblit/GitBlit.java
index f96340a..dc53540 100644
--- a/src/com/gitblit/GitBlit.java
+++ b/src/com/gitblit/GitBlit.java
@@ -189,6 +189,15 @@
 		return self().timezone;
 	}
 	
+	/**
+	 * Returns the charset names configured by the web.blobEncodings setting.
+	 * 
+	 * @return an array of encodings (copied from the setting), may be empty
+	 */
+	public static String [] getEncodings() {
+		return getStrings(Keys.web.blobEncodings).toArray(new String[0]);
+	}
+	
 
 	/**
 	 * Returns the boolean value for the specified key. If the key does not
diff --git a/src/com/gitblit/LuceneExecutor.java b/src/com/gitblit/LuceneExecutor.java
index afd1cc5..b316543 100644
--- a/src/com/gitblit/LuceneExecutor.java
+++ b/src/com/gitblit/LuceneExecutor.java
@@ -642,6 +642,7 @@
 			String branch, RevCommit commit) {
 		IndexResult result = new IndexResult();
 		try {
+			String [] encodings = storedSettings.getStrings(Keys.web.blobEncodings).toArray(new String[0]);
 			List<PathChangeModel> changedPaths = JGitUtils.getFilesInCommit(repository, commit);
 			String revDate = DateTools.timeToString(commit.getCommitTime() * 1000L,
 					Resolution.MINUTE);
@@ -674,7 +675,7 @@
 					if (StringUtils.isEmpty(ext) || !excludedExtensions.contains(ext)) {
 						// read the blob content
 						String str = JGitUtils.getStringContent(repository, commit.getTree(),
-								path.path);
+								path.path, encodings);
 						doc.add(new Field(FIELD_CONTENT, str, Store.YES, Index.ANALYZED));
 						writer.addDocument(doc);
 					}
diff --git a/src/com/gitblit/PagesServlet.java b/src/com/gitblit/PagesServlet.java
index d6304f7..ad9276b 100644
--- a/src/com/gitblit/PagesServlet.java
+++ b/src/com/gitblit/PagesServlet.java
@@ -141,13 +141,15 @@
 			}
 			response.setDateHeader("Last-Modified", JGitUtils.getCommitDate(commit).getTime());
 
+			String [] encodings = GitBlit.getEncodings();
+
 			RevTree tree = commit.getTree();
 			byte[] content = null;
 			if (StringUtils.isEmpty(resource)) {
 				// find resource
 				String[] files = { "index.html", "index.htm", "index.mkd" };
 				for (String file : files) {
-					content = JGitUtils.getStringContent(r, tree, file)
+					content = JGitUtils.getStringContent(r, tree, file, encodings)
 							.getBytes(Constants.ENCODING);
 					if (content != null) {
 						resource = file;
@@ -165,7 +167,7 @@
 						contentType = "text/plain";
 					}
 					if (contentType.startsWith("text")) {
-						content = JGitUtils.getStringContent(r, tree, resource).getBytes(
+						content = JGitUtils.getStringContent(r, tree, resource, encodings).getBytes(
 								Constants.ENCODING);
 					} else {
 						content = JGitUtils.getByteContent(r, tree, resource);
@@ -177,7 +179,7 @@
 
 			// no content, try custom 404 page
 			if (ArrayUtils.isEmpty(content)) {
-				String custom404 = JGitUtils.getStringContent(r, tree, "404.html");
+				String custom404 = JGitUtils.getStringContent(r, tree, "404.html", encodings);
 				if (!StringUtils.isEmpty(custom404)) {
 					content = custom404.getBytes(Constants.ENCODING);
 				}
diff --git a/src/com/gitblit/utils/JGitUtils.java b/src/com/gitblit/utils/JGitUtils.java
index f5ca5ef..72a8ab3 100644
--- a/src/com/gitblit/utils/JGitUtils.java
+++ b/src/com/gitblit/utils/JGitUtils.java
@@ -20,7 +20,6 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
-import java.nio.charset.Charset;
 import java.text.MessageFormat;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -543,14 +542,15 @@
 	 * @param tree
 	 *            if null, the RevTree from HEAD is assumed.
 	 * @param blobPath
+	 * @param charsets optional
 	 * @return UTF-8 string content
 	 */
-	public static String getStringContent(Repository repository, RevTree tree, String blobPath) {
+	public static String getStringContent(Repository repository, RevTree tree, String blobPath, String... charsets) {
 		byte[] content = getByteContent(repository, tree, blobPath);
 		if (content == null) {
 			return null;
 		}
-		return new String(content, Charset.forName(Constants.CHARACTER_ENCODING));
+		return StringUtils.decodeString(content, charsets);
 	}
 
 	/**
@@ -589,14 +589,15 @@
 	 * 
 	 * @param repository
 	 * @param objectId
+	 * @param charsets optional
 	 * @return UTF-8 string content
 	 */
-	public static String getStringContent(Repository repository, String objectId) {
+	public static String getStringContent(Repository repository, String objectId, String... charsets) {
 		byte[] content = getByteContent(repository, objectId);
 		if (content == null) {
 			return null;
 		}
-		return new String(content, Charset.forName(Constants.CHARACTER_ENCODING));
+		return StringUtils.decodeString(content, charsets);
 	}
 
 	/**
diff --git a/src/com/gitblit/utils/StringUtils.java b/src/com/gitblit/utils/StringUtils.java
index 2c35724..baed5f0 100644
--- a/src/com/gitblit/utils/StringUtils.java
+++ b/src/com/gitblit/utils/StringUtils.java
@@ -16,13 +16,23 @@
 package com.gitblit.utils;
 
 import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.UnsupportedCharsetException;
 import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Comparator;
+import java.util.LinkedHashSet;
 import java.util.List;
+import java.util.Set;
 import java.util.regex.PatternSyntaxException;
 
 /**
@@ -550,4 +560,36 @@
 		// remember to append any characters to the right of a match
 		return sb.toString();
 	}
+	
+	/**
+	 * Decodes a byte array to a string by trying several charsets in order until
+	 * one decodes the content without a coding error.  The last resort is to
+	 * interpret the content as UTF-8 with illegal character substitution.
+	 * 
+	 * @param content the bytes to decode, must not be null
+	 * @param charsets optional, preferred charset names which are tried first;
+	 *            null, illegal, and unsupported names are skipped
+	 * @return a string
+	 */
+	public static String decodeString(byte [] content, String... charsets) {
+		Set<String> sets = new LinkedHashSet<String>();
+		if (!ArrayUtils.isEmpty(charsets)) {
+			sets.addAll(Arrays.asList(charsets));
+		}
+		// defaults are always tried last; the set keeps order and drops duplicates
+		sets.addAll(Arrays.asList("UTF-8", "ISO-8859-1", Charset.defaultCharset().name()));
+		for (String charset : sets) {
+			try {
+				Charset cs = Charset.forName(charset);
+				CharsetDecoder decoder = cs.newDecoder();
+				CharBuffer buffer = decoder.decode(ByteBuffer.wrap(content));
+				return buffer.toString();
+			} catch (CharacterCodingException e) {
+				// ignore and advance to the next charset
+			} catch (IllegalArgumentException e) {
+				// ignore null, illegal, or unsupported charset names
+			}
+		}
+		return new String(content, Charset.forName("UTF-8"));
+	}
 }
\ No newline at end of file
diff --git a/src/com/gitblit/wicket/pages/BlobPage.java b/src/com/gitblit/wicket/pages/BlobPage.java
index 1c43837..fb5a962 100644
--- a/src/com/gitblit/wicket/pages/BlobPage.java
+++ b/src/com/gitblit/wicket/pages/BlobPage.java
@@ -41,7 +41,8 @@
 
 		Repository r = getRepository();
 		final String blobPath = WicketUtils.getPath(params);
-
+		String [] encodings = GitBlit.getEncodings();
+		
 		if (StringUtils.isEmpty(blobPath)) {
 			// blob by objectid
 
@@ -54,7 +55,7 @@
 			add(new BookmarkablePageLink<Void>("headLink", BlobPage.class).setEnabled(false));
 			add(new CommitHeaderPanel("commitHeader", objectId));
 			add(new PathBreadcrumbsPanel("breadcrumbs", repositoryName, blobPath, objectId));
-			Component c = new Label("blobText", JGitUtils.getStringContent(r, objectId));
+			Component c = new Label("blobText", JGitUtils.getStringContent(r, objectId, encodings));
 			WicketUtils.setCssClass(c, "plainprint");
 			add(c);
 		} else {
@@ -111,7 +112,7 @@
 				case 1:
 					// PrettyPrint blob text
 					c = new Label("blobText", JGitUtils.getStringContent(r, commit.getTree(),
-							blobPath));
+							blobPath, encodings));
 					WicketUtils.setCssClass(c, "prettyprint linenums");
 					break;
 				case 2:
@@ -125,14 +126,14 @@
 				default:
 					// plain text
 					c = new Label("blobText", JGitUtils.getStringContent(r, commit.getTree(),
-							blobPath));
+							blobPath, encodings));
 					WicketUtils.setCssClass(c, "plainprint");
 				}
 				add(c);
 			} else {
 				// plain text
 				Label blobLabel = new Label("blobText", JGitUtils.getStringContent(r,
-						commit.getTree(), blobPath));
+						commit.getTree(), blobPath, encodings));
 				WicketUtils.setCssClass(blobLabel, "plainprint");
 				add(blobLabel);
 			}
diff --git a/src/com/gitblit/wicket/pages/MarkdownPage.java b/src/com/gitblit/wicket/pages/MarkdownPage.java
index aaf12ba..5764235 100644
--- a/src/com/gitblit/wicket/pages/MarkdownPage.java
+++ b/src/com/gitblit/wicket/pages/MarkdownPage.java
@@ -24,6 +24,7 @@
 import org.eclipse.jgit.lib.Repository;
 import org.eclipse.jgit.revwalk.RevCommit;
 
+import com.gitblit.GitBlit;
 import com.gitblit.utils.JGitUtils;
 import com.gitblit.utils.MarkdownUtils;
 import com.gitblit.wicket.WicketUtils;
@@ -37,7 +38,8 @@
 
 		Repository r = getRepository();
 		RevCommit commit = JGitUtils.getCommit(r, objectId);
-
+		String [] encodings = GitBlit.getEncodings();
+		
 		// markdown page links
 		add(new BookmarkablePageLink<Void>("blameLink", BlamePage.class,
 				WicketUtils.newPathParameter(repositoryName, objectId, markdownPath)));
@@ -49,7 +51,7 @@
 				WicketUtils.newPathParameter(repositoryName, Constants.HEAD, markdownPath)));
 
 		// Read raw markdown content and transform it to html
-		String markdownText = JGitUtils.getStringContent(r, commit.getTree(), markdownPath);
+		String markdownText = JGitUtils.getStringContent(r, commit.getTree(), markdownPath, encodings);
 		String htmlText;
 		try {
 			htmlText = MarkdownUtils.transformMarkdown(markdownText);
diff --git a/src/com/gitblit/wicket/pages/RawPage.java b/src/com/gitblit/wicket/pages/RawPage.java
index f71d986..00cc5bf 100644
--- a/src/com/gitblit/wicket/pages/RawPage.java
+++ b/src/com/gitblit/wicket/pages/RawPage.java
@@ -43,7 +43,8 @@
 		final String repositoryName = WicketUtils.getRepositoryName(params);
 		final String objectId = WicketUtils.getObject(params);
 		final String blobPath = WicketUtils.getPath(params);
-
+		String [] encodings = GitBlit.getEncodings();
+		
 		Repository r = GitBlit.self().getRepository(repositoryName);
 		if (r == null) {
 			error(getString("gb.canNotLoadRepository") + " " + repositoryName);
@@ -53,7 +54,7 @@
 
 		if (StringUtils.isEmpty(blobPath)) {
 			// objectid referenced raw view
-			Label blobLabel = new Label("rawText", JGitUtils.getStringContent(r, objectId));
+			Label blobLabel = new Label("rawText", JGitUtils.getStringContent(r, objectId, encodings));
 			WicketUtils.setCssClass(blobLabel, "plainprint");
 			add(blobLabel);
 		} else {
@@ -92,14 +93,14 @@
 				default:
 					// plain text
 					c = new Label("rawText", JGitUtils.getStringContent(r, commit.getTree(),
-							blobPath));
+							blobPath, encodings));
 					WicketUtils.setCssClass(c, "plainprint");
 				}
 				add(c);
 			} else {
 				// plain text
 				Label blobLabel = new Label("rawText", JGitUtils.getStringContent(r,
-						commit.getTree(), blobPath));
+						commit.getTree(), blobPath, encodings));
 				WicketUtils.setCssClass(blobLabel, "plainprint");
 				add(blobLabel);
 			}
diff --git a/src/com/gitblit/wicket/pages/SummaryPage.java b/src/com/gitblit/wicket/pages/SummaryPage.java
index 2996b66..8e145c8 100644
--- a/src/com/gitblit/wicket/pages/SummaryPage.java
+++ b/src/com/gitblit/wicket/pages/SummaryPage.java
@@ -158,7 +158,8 @@
 					}
 				}
 				if (!StringUtils.isEmpty(readme)) {
-					String markdownText = JGitUtils.getStringContent(r, head.getTree(), readme);
+					String [] encodings = GitBlit.getEncodings();
+					String markdownText = JGitUtils.getStringContent(r, head.getTree(), readme, encodings);
 					htmlText = MarkdownUtils.transformMarkdown(markdownText);
 				}
 			} catch (ParseException p) {
diff --git a/tests/com/gitblit/tests/JGitUtilsTest.java b/tests/com/gitblit/tests/JGitUtilsTest.java
index 616ea83..dc4d3c5 100644
--- a/tests/com/gitblit/tests/JGitUtilsTest.java
+++ b/tests/com/gitblit/tests/JGitUtilsTest.java
@@ -37,6 +37,7 @@
 import org.eclipse.jgit.lib.Repository;
 import org.eclipse.jgit.lib.RepositoryCache.FileKey;
 import org.eclipse.jgit.revwalk.RevCommit;
+import org.eclipse.jgit.revwalk.RevTree;
 import org.eclipse.jgit.util.FS;
 import org.eclipse.jgit.util.FileUtils;
 import org.junit.Test;
@@ -265,7 +266,7 @@
 	@Test
 	public void testStringContent() throws Exception {
 		Repository repository = GitBlitSuite.getHelloworldRepository();
-		String contentA = JGitUtils.getStringContent(repository, null, "java.java");
+		String contentA = JGitUtils.getStringContent(repository, (RevTree) null, "java.java");
 		RevCommit commit = JGitUtils.getCommit(repository, Constants.HEAD);
 		String contentB = JGitUtils.getStringContent(repository, commit.getTree(), "java.java");
 		String contentC = JGitUtils.getStringContent(repository, commit.getTree(), "missing.txt");

--
Gitblit v1.9.1