/*
* Copyright 2012 gitblit.com.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.gitblit.service;
import static org.eclipse.jgit.treewalk.filter.TreeFilter.ANY_DIFF;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.text.MessageFormat;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.DateTools.Resolution;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.eclipse.jgit.diff.DiffEntry.ChangeType;
import org.eclipse.jgit.lib.Constants;
import org.eclipse.jgit.lib.FileMode;
import org.eclipse.jgit.lib.ObjectId;
import org.eclipse.jgit.lib.ObjectLoader;
import org.eclipse.jgit.lib.ObjectReader;
import org.eclipse.jgit.lib.Repository;
import org.eclipse.jgit.lib.RepositoryCache.FileKey;
import org.eclipse.jgit.revwalk.RevCommit;
import org.eclipse.jgit.revwalk.RevTree;
import org.eclipse.jgit.revwalk.RevWalk;
import org.eclipse.jgit.storage.file.FileBasedConfig;
import org.eclipse.jgit.treewalk.EmptyTreeIterator;
import org.eclipse.jgit.treewalk.TreeWalk;
import org.eclipse.jgit.util.FS;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.gitblit.Constants.SearchObjectType;
import com.gitblit.IStoredSettings;
import com.gitblit.Keys;
import com.gitblit.manager.IRepositoryManager;
import com.gitblit.models.PathModel.PathChangeModel;
import com.gitblit.models.RefModel;
import com.gitblit.models.RepositoryModel;
import com.gitblit.models.SearchResult;
import com.gitblit.utils.ArrayUtils;
import com.gitblit.utils.JGitUtils;
import com.gitblit.utils.StringUtils;
/**
* The Lucene service handles indexing and searching repositories.
*
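* <p>
* Typical usage, as a minimal sketch (assumes the caller already holds the
* application's {@code IStoredSettings} and {@code IRepositoryManager};
* the repository name is illustrative):
* <pre>{@code
* LuceneService lucene = new LuceneService(settings, repositoryManager);
* lucene.run(); // index all repositories that have indexed branches
* List<SearchResult> hits = lucene.search("readme", 1, 25, "myrepo.git");
* lucene.close();
* }</pre>
*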
* @author James Moger
*
*/
public class LuceneService implements Runnable {
private static final int INDEX_VERSION = 5;
private static final String FIELD_OBJECT_TYPE = "type";
private static final String FIELD_PATH = "path";
private static final String FIELD_COMMIT = "commit";
private static final String FIELD_BRANCH = "branch";
private static final String FIELD_SUMMARY = "summary";
private static final String FIELD_CONTENT = "content";
private static final String FIELD_AUTHOR = "author";
private static final String FIELD_COMMITTER = "committer";
private static final String FIELD_DATE = "date";
private static final String FIELD_TAG = "tag";
private static final String CONF_FILE = "lucene.conf";
private static final String LUCENE_DIR = "lucene";
private static final String CONF_INDEX = "index";
private static final String CONF_VERSION = "version";
private static final String CONF_ALIAS = "aliases";
private static final String CONF_BRANCH = "branches";
private static final Version LUCENE_VERSION = Version.LUCENE_46;
private final Logger logger = LoggerFactory.getLogger(LuceneService.class);
private final IStoredSettings storedSettings;
private final IRepositoryManager repositoryManager;
private final File repositoriesFolder;
private final Map<String, IndexSearcher> searchers = new ConcurrentHashMap<String, IndexSearcher>();
private final Map<String, IndexWriter> writers = new ConcurrentHashMap<String, IndexWriter>();
private final String luceneIgnoreExtensions = "7z arc arj bin bmp dll doc docx exe gif gz jar jpg lib lzh odg odf odt pdf ppt png so swf xcf xls xlsx zip";
private Set<String> excludedExtensions;
public LuceneService(
IStoredSettings settings,
IRepositoryManager repositoryManager) {
this.storedSettings = settings;
this.repositoryManager = repositoryManager;
this.repositoriesFolder = repositoryManager.getRepositoriesFolder();
String exts = luceneIgnoreExtensions;
if (settings != null) {
exts = settings.getString(Keys.web.luceneIgnoreExtensions, exts);
}
excludedExtensions = new TreeSet<String>(StringUtils.getStringsFromValue(exts));
}
/**
* Run is executed by the Gitblit executor service. Because this is called
* by an executor service, calls will queue - i.e. there can never be
* concurrent execution of repository index updates.
*/
@Override
public void run() {
if (!storedSettings.getBoolean(Keys.web.allowLuceneIndexing, true)) {
// Lucene indexing is disabled
return;
}
// reload the excluded extensions
String exts = storedSettings.getString(Keys.web.luceneIgnoreExtensions, luceneIgnoreExtensions);
excludedExtensions = new TreeSet<String>(StringUtils.getStringsFromValue(exts));
if (repositoryManager.isCollectingGarbage()) {
// busy collecting garbage, try again later
return;
}
for (String repositoryName: repositoryManager.getRepositoryList()) {
RepositoryModel model = repositoryManager.getRepositoryModel(repositoryName);
if (model.hasCommits && !ArrayUtils.isEmpty(model.indexedBranches)) {
Repository repository = repositoryManager.getRepository(model.name);
if (repository == null) {
if (repositoryManager.isCollectingGarbage(model.name)) {
logger.info(MessageFormat.format("Skipping Lucene index of {0}, busy garbage collecting", repositoryName));
}
continue;
}
index(model, repository);
repository.close();
System.gc();
}
}
}
/**
* Synchronously indexes a repository. This may build a complete index of a
* repository or it may update an existing index.
*
* @param model
* the repository model
* @param repository
* the repository object
*/
private void index(RepositoryModel model, Repository repository) {
try {
if (shouldReindex(repository)) {
// (re)build the entire index
IndexResult result = reindex(model, repository);
if (result.success) {
if (result.commitCount > 0) {
String msg = "Built {0} Lucene index from {1} commits and {2} files across {3} branches in {4} secs";
logger.info(MessageFormat.format(msg, model.name, result.commitCount,
result.blobCount, result.branchCount, result.duration()));
}
} else {
String msg = "Could not build {0} Lucene index!";
logger.error(MessageFormat.format(msg, model.name));
}
} else {
// update the index with latest commits
IndexResult result = updateIndex(model, repository);
if (result.success) {
if (result.commitCount > 0) {
String msg = "Updated {0} Lucene index with {1} commits and {2} files across {3} branches in {4} secs";
logger.info(MessageFormat.format(msg, model.name, result.commitCount,
result.blobCount, result.branchCount, result.duration()));
}
} else {
String msg = "Could not update {0} Lucene index!";
logger.error(MessageFormat.format(msg, model.name));
}
}
} catch (Throwable t) {
logger.error(MessageFormat.format("Lucene indexing failure for {0}", model.name), t);
}
}
/**
* Close the writer/searcher objects for a repository.
*
* @param repositoryName
*/
public synchronized void close(String repositoryName) {
try {
IndexSearcher searcher = searchers.remove(repositoryName);
if (searcher != null) {
searcher.getIndexReader().close();
}
} catch (Exception e) {
logger.error("Failed to close index searcher for " + repositoryName, e);
}
try {
IndexWriter writer = writers.remove(repositoryName);
if (writer != null) {
writer.close();
}
} catch (Exception e) {
logger.error("Failed to close index writer for " + repositoryName, e);
}
}
/**
* Close all Lucene indexers.
*
*/
public synchronized void close() {
// close all writers
for (String writer : writers.keySet()) {
try {
writers.get(writer).close(true);
} catch (Throwable t) {
logger.error("Failed to close Lucene writer for " + writer, t);
}
}
writers.clear();
// close all searchers
for (String searcher : searchers.keySet()) {
try {
searchers.get(searcher).getIndexReader().close();
} catch (Throwable t) {
logger.error("Failed to close Lucene searcher for " + searcher, t);
}
}
searchers.clear();
}
/**
* Deletes the Lucene index for the specified repository.
*
* @param repositoryName
* @return true, if successful
*/
public boolean deleteIndex(String repositoryName) {
try {
// close any open writer/searcher
close(repositoryName);
// delete the index folder
File repositoryFolder = FileKey.resolve(new File(repositoriesFolder, repositoryName), FS.DETECTED);
File luceneIndex = new File(repositoryFolder, LUCENE_DIR);
if (luceneIndex.exists()) {
org.eclipse.jgit.util.FileUtils.delete(luceneIndex,
org.eclipse.jgit.util.FileUtils.RECURSIVE);
}
// delete the config file
File luceneConfig = new File(repositoryFolder, CONF_FILE);
if (luceneConfig.exists()) {
luceneConfig.delete();
}
return true;
} catch (IOException e) {
throw new RuntimeException(e);
}
}
/**
* Returns the author for the commit, if this information is available.
*
* @param commit
* @return an author or unknown
*/
private String getAuthor(RevCommit commit) {
String name = "unknown";
try {
name = commit.getAuthorIdent().getName();
if (StringUtils.isEmpty(name)) {
name = commit.getAuthorIdent().getEmailAddress();
}
} catch (NullPointerException n) {
}
return name;
}
/**
* Returns the committer for the commit, if this information is available.
*
* @param commit
* @return a committer or unknown
*/
private String getCommitter(RevCommit commit) {
String name = "unknown";
try {
name = commit.getCommitterIdent().getName();
if (StringUtils.isEmpty(name)) {
name = commit.getCommitterIdent().getEmailAddress();
}
} catch (NullPointerException n) {
}
return name;
}
/**
* Get the tree associated with the given commit.
*
* @param walk
* @param commit
* @return tree
* @throws IOException
*/
private RevTree getTree(final RevWalk walk, final RevCommit commit)
throws IOException {
final RevTree tree = commit.getTree();
if (tree != null) {
return tree;
}
walk.parseHeaders(commit);
return commit.getTree();
}
/**
* Construct a keyname from the branch.
*
* @param branchName
* @return a keyname appropriate for the Git config file format
*/
private String getBranchKey(String branchName) {
return StringUtils.getSHA1(branchName);
}
/**
* Returns the Lucene configuration for the specified repository.
*
* @param repository
* @return a config object
*/
private FileBasedConfig getConfig(Repository repository) {
File file = new File(repository.getDirectory(), CONF_FILE);
FileBasedConfig config = new FileBasedConfig(file, FS.detect());
return config;
}
/**
* Reads the Lucene config file for the repository to check the index
* version. If the index version is different, then rebuild the repository
* index.
*
* @param repository
* @return true if the on-disk index version differs from INDEX_VERSION
*/
private boolean shouldReindex(Repository repository) {
try {
FileBasedConfig config = getConfig(repository);
config.load();
int indexVersion = config.getInt(CONF_INDEX, CONF_VERSION, 0);
// reindex if versions do not match
return indexVersion != INDEX_VERSION;
} catch (Throwable t) {
}
return true;
}
/**
* This completely indexes the repository and will destroy any existing
* index.
*
* @param model
* @param repository
* @return IndexResult
*/
public IndexResult reindex(RepositoryModel model, Repository repository) {
IndexResult result = new IndexResult();
if (!deleteIndex(model.name)) {
return result;
}
try {
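// character encodings to try, in order, when decoding blob content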
String [] encodings = storedSettings.getStrings(Keys.web.blobEncodings).toArray(new String[0]);
FileBasedConfig config = getConfig(repository);
Set<String> indexedCommits = new TreeSet<String>();
IndexWriter writer = getIndexWriter(model.name);
// build a quick lookup of tags
Map<String, List<String>> tags = new HashMap<String, List<String>>();
for (RefModel tag : JGitUtils.getTags(repository, false, -1)) {
if (!tag.isAnnotatedTag()) {
// skip non-annotated tags
continue;
}
if (!tags.containsKey(tag.getReferencedObjectId().getName())) {
tags.put(tag.getReferencedObjectId().getName(), new ArrayList<String>());
}
tags.get(tag.getReferencedObjectId().getName()).add(tag.displayName);
}
ObjectReader reader = repository.newObjectReader();
// get the local branches
List<RefModel> branches = JGitUtils.getLocalBranches(repository, true, -1);
// sort them by most recently updated
Collections.sort(branches, new Comparator<RefModel>() {
@Override
public int compare(RefModel ref1, RefModel ref2) {
return ref2.getDate().compareTo(ref1.getDate());
}
});
// reorder default branch to first position
RefModel defaultBranch = null;
ObjectId defaultBranchId = JGitUtils.getDefaultBranch(repository);
for (RefModel branch : branches) {
if (branch.getObjectId().equals(defaultBranchId)) {
defaultBranch = branch;
break;
}
}
branches.remove(defaultBranch);
branches.add(0, defaultBranch);
// walk through each branch
for (RefModel branch : branches) {
boolean indexBranch = false;
if (model.indexedBranches.contains(com.gitblit.Constants.DEFAULT_BRANCH)
&& branch.equals(defaultBranch)) {
// indexing "default" branch
indexBranch = true;
} else if (branch.getName().startsWith(com.gitblit.Constants.R_META)) {
// skip internal meta branches
indexBranch = false;
} else {
// normal explicit branch check
indexBranch = model.indexedBranches.contains(branch.getName());
}
// if this branch is not specifically indexed then skip
if (!indexBranch) {
continue;
}
String branchName = branch.getName();
RevWalk revWalk = new RevWalk(reader);
RevCommit tip = revWalk.parseCommit(branch.getObjectId());
String tipId = tip.getId().getName();
String keyName = getBranchKey(branchName);
config.setString(CONF_ALIAS, null, keyName, branchName);
config.setString(CONF_BRANCH, null, keyName, tipId);
// index the blob contents of the tree
TreeWalk treeWalk = new TreeWalk(repository);
treeWalk.addTree(tip.getTree());
treeWalk.setRecursive(true);
Map<String, ObjectId> paths = new TreeMap<String, ObjectId>();
while (treeWalk.next()) {
// ensure path is not in a submodule
if (treeWalk.getFileMode(0) != FileMode.GITLINK) {
paths.put(treeWalk.getPathString(), treeWalk.getObjectId(0));
}
}
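// reusable stream and buffer for reading blob contents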
ByteArrayOutputStream os = new ByteArrayOutputStream();
byte[] tmp = new byte[32767];
RevWalk commitWalk = new RevWalk(reader);
commitWalk.markStart(tip);
RevCommit commit;
while ((paths.size() > 0) && (commit = commitWalk.next()) != null) {
TreeWalk diffWalk = new TreeWalk(reader);
int parentCount = commit.getParentCount();
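// choose the base tree for the diff: an empty tree for a root commit,
// the single parent's tree otherwise; merge commits are skipped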
switch (parentCount) {
case 0:
diffWalk.addTree(new EmptyTreeIterator());
break;
case 1:
diffWalk.addTree(getTree(commitWalk, commit.getParent(0)));
break;
default:
// skip merge commits
continue;
}
diffWalk.addTree(getTree(commitWalk, commit));
diffWalk.setFilter(ANY_DIFF);
diffWalk.setRecursive(true);
while ((paths.size() > 0) && diffWalk.next()) {
String path = diffWalk.getPathString();
if (!paths.containsKey(path)) {
continue;
}
// remove path from set
ObjectId blobId = paths.remove(path);
result.blobCount++;
// index the blob metadata
String blobAuthor = getAuthor(commit);
String blobCommitter = getCommitter(commit);
String blobDate = DateTools.timeToString(commit.getCommitTime() * 1000L,
Resolution.MINUTE);
Document doc = new Document();
doc.add(new Field(FIELD_OBJECT_TYPE, SearchObjectType.blob.name(), StringField.TYPE_STORED));
doc.add(new Field(FIELD_BRANCH, branchName, TextField.TYPE_STORED));
doc.add(new Field(FIELD_COMMIT, commit.getName(), TextField.TYPE_STORED));
doc.add(new Field(FIELD_PATH, path, TextField.TYPE_STORED));
doc.add(new Field(FIELD_DATE, blobDate, StringField.TYPE_STORED));
doc.add(new Field(FIELD_AUTHOR, blobAuthor, TextField.TYPE_STORED));
doc.add(new Field(FIELD_COMMITTER, blobCommitter, TextField.TYPE_STORED));
// determine extension to compare to the extension
// blacklist
String ext = null;
String name = path.toLowerCase();
if (name.indexOf('.') > -1) {
ext = name.substring(name.lastIndexOf('.') + 1);
}
// index the blob content
if (StringUtils.isEmpty(ext) || !excludedExtensions.contains(ext)) {
ObjectLoader ldr = repository.open(blobId, Constants.OBJ_BLOB);
InputStream in = ldr.openStream();
int n;
while ((n = in.read(tmp)) > 0) {
os.write(tmp, 0, n);
}
in.close();
byte[] content = os.toByteArray();
String str = StringUtils.decodeString(content, encodings);
doc.add(new Field(FIELD_CONTENT, str, TextField.TYPE_STORED));
os.reset();
}
// add the blob to the index
writer.addDocument(doc);
}
}
os.close();
// index the tip commit object
if (indexedCommits.add(tipId)) {
Document doc = createDocument(tip, tags.get(tipId));
doc.add(new Field(FIELD_BRANCH, branchName, TextField.TYPE_STORED));
writer.addDocument(doc);
result.commitCount += 1;
result.branchCount += 1;
}
// traverse the log and index the previous commit objects
RevWalk historyWalk = new RevWalk(reader);
historyWalk.markStart(historyWalk.parseCommit(tip.getId()));
RevCommit rev;
while ((rev = historyWalk.next()) != null) {
String hash = rev.getId().getName();
if (indexedCommits.add(hash)) {
Document doc = createDocument(rev, tags.get(hash));
doc.add(new Field(FIELD_BRANCH, branchName, TextField.TYPE_STORED));
writer.addDocument(doc);
result.commitCount += 1;
}
}
}
// finished
reader.release();
// commit all changes and reset the searcher
config.setInt(CONF_INDEX, null, CONF_VERSION, INDEX_VERSION);
config.save();
writer.commit();
resetIndexSearcher(model.name);
result.success();
} catch (Exception e) {
logger.error("Exception while reindexing " + model.name, e);
}
return result;
}
/**
* Incrementally update the index with the specified commit for the
* repository.
*
* @param repositoryName
* @param repository
* @param branch
* the fully qualified branch name (e.g. refs/heads/master)
* @param commit
* @return an IndexResult
*/
private IndexResult index(String repositoryName, Repository repository,
String branch, RevCommit commit) {
IndexResult result = new IndexResult();
try {
String [] encodings = storedSettings.getStrings(Keys.web.blobEncodings).toArray(new String[0]);
List<PathChangeModel> changedPaths = JGitUtils.getFilesInCommit(repository, commit);
String revDate = DateTools.timeToString(commit.getCommitTime() * 1000L,
Resolution.MINUTE);
IndexWriter writer = getIndexWriter(repositoryName);
for (PathChangeModel path : changedPaths) {
if (path.isSubmodule()) {
continue;
}
// delete the indexed blob
deleteBlob(repositoryName, branch, path.name);
// re-index the blob
if (!ChangeType.DELETE.equals(path.changeType)) {
result.blobCount++;
Document doc = new Document();
doc.add(new Field(FIELD_OBJECT_TYPE, SearchObjectType.blob.name(), StringField.TYPE_STORED));
doc.add(new Field(FIELD_BRANCH, branch, TextField.TYPE_STORED));
doc.add(new Field(FIELD_COMMIT, commit.getName(), TextField.TYPE_STORED));
doc.add(new Field(FIELD_PATH, path.path, TextField.TYPE_STORED));
doc.add(new Field(FIELD_DATE, revDate, StringField.TYPE_STORED));
doc.add(new Field(FIELD_AUTHOR, getAuthor(commit), TextField.TYPE_STORED));
doc.add(new Field(FIELD_COMMITTER, getCommitter(commit), TextField.TYPE_STORED));
// determine extension to compare to the extension
// blacklist
String ext = null;
String name = path.name.toLowerCase();
if (name.indexOf('.') > -1) {
ext = name.substring(name.lastIndexOf('.') + 1);
}
if (StringUtils.isEmpty(ext) || !excludedExtensions.contains(ext)) {
// read the blob content
String str = JGitUtils.getStringContent(repository, commit.getTree(),
path.path, encodings);
if (str != null) {
doc.add(new Field(FIELD_CONTENT, str, TextField.TYPE_STORED));
writer.addDocument(doc);
}
}
}
}
writer.commit();
// get any annotated commit tags
List<String> commitTags = new ArrayList<String>();
for (RefModel ref : JGitUtils.getTags(repository, false, -1)) {
if (ref.isAnnotatedTag() && ref.getReferencedObjectId().equals(commit.getId())) {
commitTags.add(ref.displayName);
}
}
// create and write the Lucene document
Document doc = createDocument(commit, commitTags);
doc.add(new Field(FIELD_BRANCH, branch, TextField.TYPE_STORED));
result.commitCount++;
result.success = index(repositoryName, doc);
} catch (Exception e) {
logger.error(MessageFormat.format("Exception while indexing commit {0} in {1}", commit.getId().getName(), repositoryName), e);
}
return result;
}
/**
* Delete a blob from the specified branch of the repository index.
*
* @param repositoryName
* @param branch
* @param path
* @return true if a record was deleted, false if no record was deleted
* @throws Exception
*/
public boolean deleteBlob(String repositoryName, String branch, String path) throws Exception {
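// build the query in two passes: the quoted braces ('{'0'}') survive the
// first format as literal {0}/{1}/{2} placeholders, so the field names are
// substituted first and the branch/path values second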
String pattern = MessageFormat.format("{0}:'{'0'}' AND {1}:\"'{'1'}'\" AND {2}:\"'{'2'}'\"", FIELD_OBJECT_TYPE, FIELD_BRANCH, FIELD_PATH);
String q = MessageFormat.format(pattern, SearchObjectType.blob.name(), branch, path);
BooleanQuery query = new BooleanQuery();
StandardAnalyzer analyzer = new StandardAnalyzer(LUCENE_VERSION);
QueryParser qp = new QueryParser(LUCENE_VERSION, FIELD_SUMMARY, analyzer);
query.add(qp.parse(q), Occur.MUST);
IndexWriter writer = getIndexWriter(repositoryName);
int numDocsBefore = writer.numDocs();
writer.deleteDocuments(query);
writer.commit();
int numDocsAfter = writer.numDocs();
if (numDocsBefore == numDocsAfter) {
logger.debug(MessageFormat.format("no records found to delete {0}", query.toString()));
return false;
} else {
logger.debug(MessageFormat.format("deleted {0} records with {1}", numDocsBefore - numDocsAfter, query.toString()));
return true;
}
}
/**
* Updates a repository index incrementally from the last indexed commits.
*
* @param model
* @param repository
* @return IndexResult
*/
private IndexResult updateIndex(RepositoryModel model, Repository repository) {
IndexResult result = new IndexResult();
try {
FileBasedConfig config = getConfig(repository);
config.load();
// build a quick lookup of annotated tags
Map<String, List<String>> tags = new HashMap<String, List<String>>();
for (RefModel tag : JGitUtils.getTags(repository, false, -1)) {
if (!tag.isAnnotatedTag()) {
// skip non-annotated tags
continue;
}
if (!tags.containsKey(tag.getReferencedObjectId().getName())) {
tags.put(tag.getReferencedObjectId().getName(), new ArrayList<String>());
}
tags.get(tag.getReferencedObjectId().getName()).add(tag.displayName);
}
// detect branch deletion
// first assume all branches are deleted and then remove each
// existing branch from deletedBranches during indexing
Set<String> deletedBranches = new TreeSet<String>();
for (String alias : config.getNames(CONF_ALIAS)) {
String branch = config.getString(CONF_ALIAS, null, alias);
deletedBranches.add(branch);
}
// get the local branches
List<RefModel> branches = JGitUtils.getLocalBranches(repository, true, -1);
// sort them by most recently updated
Collections.sort(branches, new Comparator<RefModel>() {
@Override
public int compare(RefModel ref1, RefModel ref2) {
return ref2.getDate().compareTo(ref1.getDate());
}
});
// reorder default branch to first position
RefModel defaultBranch = null;
ObjectId defaultBranchId = JGitUtils.getDefaultBranch(repository);
for (RefModel branch : branches) {
if (branch.getObjectId().equals(defaultBranchId)) {
defaultBranch = branch;
break;
}
}
branches.remove(defaultBranch);
branches.add(0, defaultBranch);
// walk through each branch
for (RefModel branch : branches) {
String branchName = branch.getName();
boolean indexBranch = false;
if (model.indexedBranches.contains(com.gitblit.Constants.DEFAULT_BRANCH)
&& branch.equals(defaultBranch)) {
// indexing "default" branch
indexBranch = true;
} else if (branch.getName().startsWith(com.gitblit.Constants.R_META)) {
// ignore internal meta branches
indexBranch = false;
} else {
// normal explicit branch check
indexBranch = model.indexedBranches.contains(branch.getName());
}
// if this branch is not specifically indexed then skip
if (!indexBranch) {
continue;
}
// remove this branch from the deletedBranches set
deletedBranches.remove(branchName);
// determine last commit
String keyName = getBranchKey(branchName);
String lastCommit = config.getString(CONF_BRANCH, null, keyName);
List<RevCommit> revs;
if (StringUtils.isEmpty(lastCommit)) {
// new branch/unindexed branch, get all commits on branch
revs = JGitUtils.getRevLog(repository, branchName, 0, -1);
} else {
// pre-existing branch, get changes since last commit
revs = JGitUtils.getRevLog(repository, lastCommit, branchName);
}
if (revs.size() > 0) {
result.branchCount += 1;
}
// reverse the list of commits so we start with the first commit
Collections.reverse(revs);
for (RevCommit commit : revs) {
// index a commit
result.add(index(model.name, repository, branchName, commit));
}
// update the config
config.setInt(CONF_INDEX, null, CONF_VERSION, INDEX_VERSION);
config.setString(CONF_ALIAS, null, keyName, branchName);
config.setString(CONF_BRANCH, null, keyName, branch.getObjectId().getName());
config.save();
}
// the deletedBranches set will normally be empty by this point
// unless a branch really was deleted and no longer exists
if (deletedBranches.size() > 0) {
for (String branch : deletedBranches) {
IndexWriter writer = getIndexWriter(model.name);
writer.deleteDocuments(new Term(FIELD_BRANCH, branch));
writer.commit();
}
}
result.success = true;
} catch (Throwable t) {
logger.error(MessageFormat.format("Exception while updating {0} Lucene index", model.name), t);
}
return result;
}
/**
* Creates a Lucene document for a commit
*
* @param commit
* @param tags
* @return a Lucene document
*/
private Document createDocument(RevCommit commit, List<String> tags) {
Document doc = new Document();
doc.add(new Field(FIELD_OBJECT_TYPE, SearchObjectType.commit.name(), StringField.TYPE_STORED));
doc.add(new Field(FIELD_COMMIT, commit.getName(), TextField.TYPE_STORED));
doc.add(new Field(FIELD_DATE, DateTools.timeToString(commit.getCommitTime() * 1000L,
Resolution.MINUTE), StringField.TYPE_STORED));
doc.add(new Field(FIELD_AUTHOR, getAuthor(commit), TextField.TYPE_STORED));
doc.add(new Field(FIELD_COMMITTER, getCommitter(commit), TextField.TYPE_STORED));
doc.add(new Field(FIELD_SUMMARY, commit.getShortMessage(), TextField.TYPE_STORED));
doc.add(new Field(FIELD_CONTENT, commit.getFullMessage(), TextField.TYPE_STORED));
if (!ArrayUtils.isEmpty(tags)) {
doc.add(new Field(FIELD_TAG, StringUtils.flattenStrings(tags), TextField.TYPE_STORED));
}
return doc;
}
/**
* Incrementally index an object for the repository.
*
* @param repositoryName
* @param doc
* @return true, if successful
*/
private boolean index(String repositoryName, Document doc) {
try {
IndexWriter writer = getIndexWriter(repositoryName);
writer.addDocument(doc);
writer.commit();
resetIndexSearcher(repositoryName);
return true;
} catch (Exception e) {
logger.error(MessageFormat.format("Exception while incrementally updating {0} Lucene index", repositoryName), e);
}
return false;
}
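/**
* Creates a SearchResult from a Lucene document and its hit metadata.
*
* @param doc
* @param score
* @param hitId
* @param totalHits
* @return a search result
* @throws ParseException
*/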
private SearchResult createSearchResult(Document doc, float score, int hitId, int totalHits) throws ParseException {
SearchResult result = new SearchResult();
result.hitId = hitId;
result.totalHits = totalHits;
result.score = score;
result.date = DateTools.stringToDate(doc.get(FIELD_DATE));
result.summary = doc.get(FIELD_SUMMARY);
result.author = doc.get(FIELD_AUTHOR);
result.committer = doc.get(FIELD_COMMITTER);
result.type = SearchObjectType.fromName(doc.get(FIELD_OBJECT_TYPE));
result.branch = doc.get(FIELD_BRANCH);
result.commitId = doc.get(FIELD_COMMIT);
result.path = doc.get(FIELD_PATH);
if (doc.get(FIELD_TAG) != null) {
result.tags = StringUtils.getStringsFromValue(doc.get(FIELD_TAG));
}
return result;
}
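/**
* Closes and removes the cached searcher for the repository so that the
* next search opens a fresh reader over the updated index.
*
* @param repository
* @throws IOException
*/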
private synchronized void resetIndexSearcher(String repository) throws IOException {
IndexSearcher searcher = searchers.remove(repository);
if (searcher != null) {
searcher.getIndexReader().close();
}
}
/**
* Gets an index searcher for the repository.
*
* @param repository
* @return an IndexSearcher for the repository
* @throws IOException
*/
private IndexSearcher getIndexSearcher(String repository) throws IOException {
IndexSearcher searcher = searchers.get(repository);
if (searcher == null) {
IndexWriter writer = getIndexWriter(repository);
searcher = new IndexSearcher(DirectoryReader.open(writer, true));
searchers.put(repository, searcher);
}
return searcher;
}
/**
* Gets an index writer for the repository. The index will be created if it
* does not already exist.
*
* @param repository
* @return an IndexWriter
* @throws IOException
*/
private IndexWriter getIndexWriter(String repository) throws IOException {
IndexWriter indexWriter = writers.get(repository);
File repositoryFolder = FileKey.resolve(new File(repositoriesFolder, repository), FS.DETECTED);
File indexFolder = new File(repositoryFolder, LUCENE_DIR);
Directory directory = FSDirectory.open(indexFolder);
if (indexWriter == null) {
if (!indexFolder.exists()) {
indexFolder.mkdirs();
}
StandardAnalyzer analyzer = new StandardAnalyzer(LUCENE_VERSION);
IndexWriterConfig config = new IndexWriterConfig(LUCENE_VERSION, analyzer);
config.setOpenMode(OpenMode.CREATE_OR_APPEND);
indexWriter = new IndexWriter(directory, config);
writers.put(repository, indexWriter);
}
return indexWriter;
}
/**
* Searches the specified repositories for the given text or query
*
* @param text
* if the text is null or empty, null is returned
* @param page
* the page number to retrieve. page is 1-indexed.
* @param pageSize
* the number of elements to return for this page
* @param repositories
* a list of repositories to search. if no repositories are
* specified null is returned.
* @return a list of SearchResults ordered from highest to lowest score
*
*/
public List<SearchResult> search(String text, int page, int pageSize, List<String> repositories) {
if (ArrayUtils.isEmpty(repositories)) {
return null;
}
return search(text, page, pageSize, repositories.toArray(new String[0]));
}
/**
* Searches the specified repositories for the given text or query
*
* @param text
* if the text is null or empty, null is returned
* @param page
* the page number to retrieve. page is 1-indexed.
* @param pageSize
* the number of elements to return for this page
* @param repositories
* a list of repositories to search. if no repositories are
* specified null is returned.
* @return a list of SearchResults ordered from highest to lowest score
*
*/
public List<SearchResult> search(String text, int page, int pageSize, String... repositories) {
if (StringUtils.isEmpty(text)) {
return null;
}
if (ArrayUtils.isEmpty(repositories)) {
return null;
}
Set<SearchResult> results = new LinkedHashSet<SearchResult>();
StandardAnalyzer analyzer = new StandardAnalyzer(LUCENE_VERSION);
try {
// default search checks summary and content
BooleanQuery query = new BooleanQuery();
QueryParser qp;
qp = new QueryParser(LUCENE_VERSION, FIELD_SUMMARY, analyzer);
qp.setAllowLeadingWildcard(true);
query.add(qp.parse(text), Occur.SHOULD);
qp = new QueryParser(LUCENE_VERSION, FIELD_CONTENT, analyzer);
qp.setAllowLeadingWildcard(true);
query.add(qp.parse(text), Occur.SHOULD);
IndexSearcher searcher;
if (repositories.length == 1) {
// single repository search
searcher = getIndexSearcher(repositories[0]);
} else {
// multiple repository search
List<IndexReader> readers = new ArrayList<IndexReader>();
for (String repository : repositories) {
IndexSearcher repositoryIndex = getIndexSearcher(repository);
readers.add(repositoryIndex.getIndexReader());
}
IndexReader[] rdrs = readers.toArray(new IndexReader[readers.size()]);
MultiSourceReader reader = new MultiSourceReader(rdrs);
searcher = new IndexSearcher(reader);
}
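// rewrite the query so wildcard/prefix clauses are expanded against the index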
Query rewrittenQuery = searcher.rewrite(query);
logger.debug(rewrittenQuery.toString());
TopScoreDocCollector collector = TopScoreDocCollector.create(5000, true);
searcher.search(rewrittenQuery, collector);
int offset = Math.max(0, (page - 1) * pageSize);
ScoreDoc[] hits = collector.topDocs(offset, pageSize).scoreDocs;
int totalHits = collector.getTotalHits();
for (int i = 0; i < hits.length; i++) {
int docId = hits[i].doc;
Document doc = searcher.doc(docId);
SearchResult result = createSearchResult(doc, hits[i].score, offset + i + 1, totalHits);
if (repositories.length == 1) {
// single repository search
result.repository = repositories[0];
} else {
// multi-repository search
MultiSourceReader reader = (MultiSourceReader) searcher.getIndexReader();
int index = reader.getSourceIndex(docId);
result.repository = repositories[index];
}
String content = doc.get(FIELD_CONTENT);
result.fragment = getHighlightedFragment(analyzer, query, content, result);
results.add(result);
}
} catch (Exception e) {
logger.error(MessageFormat.format("Exception while searching for {0}", text), e);
}
return new ArrayList<SearchResult>(results);
}
/**
* Builds an HTML fragment of the matching content with the query terms
* highlighted.
*
* @param analyzer
* @param query
* @param content
* @param result
* @return an HTML fragment with highlighted query terms
* @throws IOException
* @throws InvalidTokenOffsetsException
*/
private String getHighlightedFragment(Analyzer analyzer, Query query,
String content, SearchResult result) throws IOException, InvalidTokenOffsetsException {
if (content == null) {
content = "";
}
int fragmentLength = SearchObjectType.commit == result.type ? 512 : 150;
QueryScorer scorer = new QueryScorer(query, "content");
Fragmenter fragmenter = new SimpleSpanFragmenter(scorer, fragmentLength);
// use an artificial delimiter for the token
String termTag = "!!--[";
String termTagEnd = "]--!!";
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(termTag, termTagEnd);
Highlighter highlighter = new Highlighter(formatter, scorer);
highlighter.setTextFragmenter(fragmenter);
String [] fragments = highlighter.getBestFragments(analyzer, "content", content, 3);
if (ArrayUtils.isEmpty(fragments)) {
if (SearchObjectType.blob == result.type) {
return "";
}
// clip commit message
String fragment = content;
if (fragment.length() > fragmentLength) {
fragment = fragment.substring(0, fragmentLength) + "...";
}
return "<pre class=\"text\">" + StringUtils.escapeForHtml(fragment, true) + "</pre>";
}
// make sure we have unique fragments
Set<String> uniqueFragments = new LinkedHashSet<String>();
for (String fragment : fragments) {
uniqueFragments.add(fragment);
}
fragments = uniqueFragments.toArray(new String[uniqueFragments.size()]);
StringBuilder sb = new StringBuilder();
for (int i = 0, len = fragments.length; i < len; i++) {
String fragment = fragments[i];
String tag = "<pre class=\"text\">";
// reconstruct the raw fragment by removing the artificial delimiters
String raw = fragment.replace(termTag, "").replace(termTagEnd, "");
// determine position of the raw fragment in the content
int pos = content.indexOf(raw);
// restore complete first line of fragment
int c = pos;
while (c > 0) {
c--;
if (content.charAt(c) == '\n') {
break;
}
}
if (c > 0) {
// inject leading chunk of first fragment line
fragment = content.substring(c + 1, pos) + fragment;
}
if (SearchObjectType.blob == result.type) {
// count lines as offset into the content for this fragment
int line = Math.max(1, StringUtils.countLines(content.substring(0, pos)));
// create fragment tag with line number and language
String lang = "";
String ext = StringUtils.getFileExtension(result.path).toLowerCase();
if (!StringUtils.isEmpty(ext)) {
// maintain leading space!
lang = " lang-" + ext;
}
tag = MessageFormat.format("<pre class=\"prettyprint linenums:{0,number,0}{1}\">", line, lang);
}
sb.append(tag);
// replace the artificial delimiter with html tags
String html = StringUtils.escapeForHtml(fragment, false);
html = html.replace(termTag, "<span class=\"highlight\">").replace(termTagEnd, "</span>");
sb.append(html);
sb.append("</pre>");
if (i < len - 1) {
sb.append("<span class=\"ellipses\">...</span><br/>");
}
}
return sb.toString();
}
/**
* Simple class to track the results of an index update.
*/
private class IndexResult {
long startTime = System.currentTimeMillis();
long endTime = startTime;
boolean success;
int branchCount;
int commitCount;
int blobCount;
void add(IndexResult result) {
this.branchCount += result.branchCount;
this.commitCount += result.commitCount;
this.blobCount += result.blobCount;
}
void success() {
success = true;
endTime = System.currentTimeMillis();
}
float duration() {
return (endTime - startTime)/1000f;
}
}
/**
* Custom subclass of MultiReader to identify the source index for a given
* doc id. This would not be necessary if there were a public method to
* obtain this information.
*
*/
private class MultiSourceReader extends MultiReader {
MultiSourceReader(IndexReader [] readers) {
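// false: do not close the shared per-repository readers when this reader is closed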
super(readers, false);
}
int getSourceIndex(int docId) {
int index = -1;
try {
index = super.readerIndex(docId);
} catch (Exception e) {
logger.error("Error getting source index", e);
}
return index;
}
}
}