// Source Code of com.google.enterprise.connector.sharepoint.spiimpl.SharepointTraversalManager

// Copyright 2007 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


package com.google.enterprise.connector.sharepoint.spiimpl;


import com.google.enterprise.connector.adgroups.AdGroupsTraversalManager;
import com.google.enterprise.connector.sharepoint.client.SharepointClient;
import com.google.enterprise.connector.sharepoint.client.SharepointClientContext;
import com.google.enterprise.connector.sharepoint.social.SharepointSocialUserProfileDocumentList;
import com.google.enterprise.connector.sharepoint.social.SharepointSocialTraversalManager;
import com.google.enterprise.connector.sharepoint.spiimpl.SharepointConnector.SocialOption;
import com.google.enterprise.connector.sharepoint.state.GlobalState;
import com.google.enterprise.connector.sharepoint.state.GlobalState.CrawlState;
import com.google.enterprise.connector.sharepoint.state.ListState;
import com.google.enterprise.connector.sharepoint.state.WebState;
import com.google.enterprise.connector.sharepoint.wsclient.client.ClientFactory;
import com.google.enterprise.connector.spi.DocumentList;
import com.google.enterprise.connector.spi.RepositoryException;
import com.google.enterprise.connector.spi.TraversalContext;
import com.google.enterprise.connector.spi.TraversalContextAware;
import com.google.enterprise.connector.spi.TraversalManager;


import java.util.Iterator;
import java.util.logging.Level;
import java.util.logging.Logger;


/**
 * This class is an implementation of the TraversalManager from the spi. All the
 * traversal based logic is invoked through this class.
 *
 * @author amit_kagrawal
 */


public class SharepointTraversalManager implements TraversalManager,
    TraversalContextAware {
  private final Logger LOGGER = Logger
      .getLogger(SharepointTraversalManager.class.getName());
  private final ClientFactory clientFactory;
  private SharepointClientContext sharepointClientContext;
  private SharepointClientContext sharepointClientContextOriginal = null;
  private GlobalState globalState;
  private int hint = -1;


  // The traversal context instance
  private TraversalContext traversalContext;
  private SharepointSocialTraversalManager socialTraversal;
  private AdGroupsTraversalManager adGroupsTraversal;


  /**
   * constructor.
   *
   * @param inConnector
   *          The instance of SharePoint connector for which traversal is to be
   *          done
   * @param inSharepointClientContext
   *          The context attached with the connector instances
   * @throws RepositoryException
   */
  public SharepointTraversalManager(final SharepointConnector inConnector,
      final SharepointClientContext inSharepointClientContext)
      throws RepositoryException {
    this(inConnector, inSharepointClientContext, null, null);
  }


  /**
   * constructor.
   *
   * @param inConnector
   *          The instance of SharePoint connector for which traversal is to be
   *          done
   * @param inSharepointClientContext
   *          The context attached with the connector instances
   * @param inSocialTraversal
   *          inner social connection traversal manager to encapsulate
   * @throws RepositoryException
   */
  public SharepointTraversalManager(final SharepointConnector inConnector,
      final SharepointClientContext inSharepointClientContext,
      SharepointSocialTraversalManager inSocialTraversal,
      AdGroupsTraversalManager inAdGroupsTraversal) 
      throws RepositoryException {
    if (inConnector == null) {
      throw new SharepointException(
          "Cannot initialize traversal manager because SharePointConnector object is null.");
    }
    if (inSharepointClientContext == null) {
      throw new SharepointException(
          "Cannot initialize traversal manager because SharePointClientContext object is null.");
    }
    clientFactory = inConnector.getClientFactory();
    try {
      socialTraversal = inSocialTraversal;
      adGroupsTraversal = inAdGroupsTraversal;
      LOGGER.config("SharepointTraversalManager: "
          + inSharepointClientContext.getSiteURL() + ", "
          + inSharepointClientContext.getGoogleConnectorWorkDir());
      sharepointClientContext = inSharepointClientContext;
      sharepointClientContextOriginal = (SharepointClientContext) inSharepointClientContext
          .clone();
      globalState = new GlobalState(clientFactory,
          inSharepointClientContext.getGoogleConnectorWorkDir(),
          inSharepointClientContext.getFeedType());
      globalState.loadState();
    } catch (final Exception e) {
      LOGGER.log(Level.WARNING, e.getMessage());
      throw new SharepointException(e);
    }
    LOGGER
        .info("SharepointTraversalManager(SharepointConnector inConnector,SharepointClientContext inSharepointClientContext)");
  }


  /**
   * Starts the traversal from a checkpoint specified by CM. The connector has
   * returned this checkpoint information to the CM at the completion of last
   * traversal. Though, SharePoint Connector does not really make use of this
   * checkpoint information for resuming the traversal. Instead, it uses the
   * state file for this purpose. State file implementation is specific to the
   * connector and CM is unaware of this.
   *
   * @param checkpoint
   *          Not really used by the SharePoint connector
   */
  public DocumentList resumeTraversal(final String checkpoint)
      throws RepositoryException {
    LOGGER.info("resumeTraversal, checkpoint received: " + checkpoint);
    DocumentList rsSocial = null;
    boolean docCheckpoint = true; // is this a user profile checkpoint or a doc checkpoint
    if (sharepointClientContext.getSocialOption() != SocialOption.NO) {
      if (checkpoint.startsWith(SharepointSocialUserProfileDocumentList.CHECKPOINT_PREFIX)) {
        rsSocial = doUserprofileCrawl(checkpoint);
        docCheckpoint = false;
      }
    }
    if (docCheckpoint) { // we are resuming a doc feed
      if (adGroupsTraversal != null) {
        adGroupsTraversal.resumeTraversal(checkpoint);
      }
      return resumeDocTraversal(checkpoint);
    } else if ((rsSocial == null) && (sharepointClientContext.getSocialOption() 
        != SocialOption.ONLY)) { // we want doc feed and social feed is complete
      return startDocTraversal();
    } else {
      return rsSocial;
    }
  }


  private DocumentList resumeDocTraversal(final String checkpoint) throws RepositoryException {
    LOGGER.info("resuming document traversal");
    // If feed type has been changed after the last traversal cycle. Let's
    // start a full recrawl
    if ((globalState.getFeedType() == null)
        || !globalState.getFeedType().equals(
            sharepointClientContext.getFeedType())) {
      LOGGER.log(Level.INFO, "feedType updated. initiating a full recrawl. ");
      return startDocTraversal();
    } else {
      sharepointClientContext.setInitialTraversal(false);
      return doTraversal();
    }
  }
  /**
   * Sets the batch hint which declares a threashold on the number of documents
   * that should be sent per traversal
   *
   * @see com.google.enterprise.connector.spi.TraversalManager
   *      #setBatchHint(int)
   */
  public void setBatchHint(final int hintNew) throws RepositoryException {
    hint = hintNew;
    if (socialTraversal != null) {
      socialTraversal.setBatchHint(hintNew);
    }
    LOGGER.info("BatchHint Set to [ " + hintNew + " ] ");
  }


  private DocumentList doUserprofileCrawl(String checkPoint) {
    DocumentList rsSocial;
    if (socialTraversal != null) {
      try {
        if ((checkPoint == null) || (checkPoint.equals(""))) {
          rsSocial = this.socialTraversal.startTraversal();
        } else {
          rsSocial = this.socialTraversal.resumeTraversal(checkPoint);
        }
      } catch (RepositoryException e) {
        boolean continuing =
            sharepointClientContext.getSocialOption() != SocialOption.ONLY;
        String message = "Failed getting user profiles"
            + (continuing ? ", continuing with the sites" : "");
        LOGGER.log(Level.SEVERE, message, e);
        rsSocial = null;
      }
    } else {
      LOGGER.info("SocialTraversalManger is null");
      rsSocial = null;
    }
    return rsSocial;
  }


  /**
   * To start a full crawl. Ignoring any checkpoint information.
   *
   * @see com.google.enterprise.connector.spi.TraversalManager #startTraversal()
   */
  public DocumentList startTraversal() throws RepositoryException {
    LOGGER.info("startTraversal()");
    DocumentList rsSocial = null;
    if (sharepointClientContext.getSocialOption() != SocialOption.NO) {
      rsSocial = doUserprofileCrawl("");
    }
    if (sharepointClientContext.getSocialOption() == SocialOption.ONLY)
      return rsSocial;


    // if there is no social traversal to be done or social traversal has
    // finished then do doc traversal
    if ((socialTraversal == null) || (rsSocial == null)) {
      if (adGroupsTraversal != null) {
        adGroupsTraversal.startTraversal();
      }
      return startDocTraversal();
    } else {
      return rsSocial;
    }
  }


  private void initializeGlobalStateForDocTraversal() {
    globalState = null;
    final String workDir = sharepointClientContext.getGoogleConnectorWorkDir();
    // delete the global state.. to simulate full crawl
    GlobalState.forgetState(workDir);
    sharepointClientContext.clearExcludedURLLogs();
    sharepointClientContext.setInitialTraversal(true);
    globalState = new GlobalState(clientFactory,
        sharepointClientContext.getGoogleConnectorWorkDir(),
        sharepointClientContext.getFeedType());
    globalState.setCrawlState(CrawlState.DOC_FEED);
  }


  public DocumentList startDocTraversal() throws RepositoryException {
    LOGGER.info("startDocTraversal");
    initializeGlobalStateForDocTraversal();
    return doTraversal();
  }


  private DocumentList doTraversal() throws RepositoryException {
    LOGGER.config("doTraversal()");


    if (hint == -1) {
      LOGGER.severe("Batch hint is -1");
      throw new SharepointException("Batch hint is -1");
    }
    if (sharepointClientContext == null) {
      LOGGER.severe("SharepointClientContext is null");
      throw new SharepointException("SharepointClientContext is null");
    }


    LOGGER.config("sharepointClientContext.feedType [ "
        + sharepointClientContext.getFeedType() + " ]");
    if (null == sharepointClientContext.getFeedType()) {
      LOGGER.severe("Aborting Traversal. Invalid Feed Type.");
      return null;
    }


    // Set the traversal context on client context so that it can be used by
    // any other classes that will make use of the same.
    sharepointClientContext.setTraversalContext(traversalContext);


    final SharepointClient sharepointClient = new SharepointClient(
        clientFactory, sharepointClientContext);


    sharepointClientContext.setBatchHint(hint);
    SPDocumentList rsAll = null;


    // First, get the documents discovered in the previous crawl cycle.
    // The true flag indicates that we want to check if there are any
    // pending docs from previous crawl cycle
    rsAll = traverse(sharepointClient, true);
    if ((rsAll != null) && (rsAll.size() > 0)) {
      LOGGER.info("Traversal returned " + rsAll.size()
          + " documents discovered in the previous batch traversal(s).");
    } else {
      LOGGER.info("No documents to be sent from previous batch traversal(s). Recrawling...");
      try {
        sharepointClient.updateGlobalState(globalState);
      } catch (final Exception e) {
        LOGGER.log(Level.SEVERE, "Exception while updating global state.... ", e);
      } catch (final Throwable t) {
        LOGGER.log(Level.SEVERE, "Error while updating global state.... ", t);
      }
      // The 'false' flag indicates that we want to scan for all lists for
      // any updates and just not the subset. This is required as the
      // above call to updateGlobalState(globalState) might have
      // discovered docs in one or more (worst case all) list states
      final SPDocumentList rs = traverse(sharepointClient, false);
      if (rs != null) {
        LOGGER.info("Traversal returned " + rs.size()
            + " documents discovered in the current batch traversal.");
        if (rsAll == null) {
          rsAll = rs;
        } else {
          rsAll.addAll(rs);
        }
      } else {
        LOGGER.info("No documents to be sent from the current crawl cycle.");
      }
      if (sharepointClient.isDoCrawl() && (null == rsAll || rsAll.size() == 0)
          && null != globalState.getLastCrawledWeb()) {
        LOGGER.log(Level.INFO, "Setting LastCrawledWebStateID and LastCrawledListStateID as null and updating the state file to reflect that a full crawl has completed...");
        globalState.setLastCrawledWeb(null);
        globalState.setLastCrawledList(null);
        globalState.saveState();
      }
    }


    if (sharepointClientContextOriginal != null) {
      LOGGER.log(Level.FINEST, "Resetting the sharepointClientContext to the original sharepointClientContext at the end of traversal.");
      sharepointClientContext = (SharepointClientContext) sharepointClientContextOriginal.clone();
    }
    if (rsAll != null) {
      LOGGER.info("Traversal returned [" + rsAll.size() + "] documents");
    } else {
      LOGGER.info("Traversal returned [0] documents");
    }


    return rsAll;
  }


  /**
   * Traverses the site for crawled docs. It checks the crawl queue for the
   * given list and creates a document list (instance of {@link SPDocumentList})
   * that will be returned from the current traversal
   * <p>
   * It will either check all lists or only a subset of lists for the current
   * site based on the flag: checkForPendingDocs. Possible cases
   * <ul>
   * <li>If checkForPendingDocs = true, it starts scanning from the web/list for
   * which checkPoint() was called from last batch traversal. Hence only a
   * subset of lists will be scanned.</li>
   * <li>If checkForPendingDocs = false, it starts scanning from the web/list
   * set during the document discovery (
   * {@link SharepointClient#updateGlobalState(GlobalState)})</li>
   * </ul>
   *
   * TODO: In future, this should always scan a subset of lists which have docs
   *       and avoid unnecessary processing of all lists and sites
   *
   * @param sharepointClient The instance of {@link SharepointClient} that will
   *          process the crawl queue to construct the document list
   * @param checkForPendingDocs If true, scans from the list at which
   *          checkPoint() was called. If false, will scan all lists
   * @return {@link SPDocumentList} The document list to be returned from
   *         current batch traversal
   * @since 2.4
   */
  private SPDocumentList traverse(final SharepointClient sharepointClient,
      boolean checkForPendingDocs) {
    if (checkForPendingDocs) {
      WebState ws = globalState.getLastCrawledWeb();
      ListState listState = globalState.getLastCrawledList();
      globalState.setCurrentWeb(ws);
      if (null != ws) {
        ws.setCurrentList(listState);
      }
    }


    // CurrentWeb and CurrentList will define the starting point for
    // the traversal/scan-of-crawl-queues. In case of list, all the
    // lists before CurrentList will not be scanned.
    // TODO: The same is to be done for webs also so that only the relevant
    // WebStates
    // gets scanned. It does not make sense to traverse all the WebStates
    // all the time. Precisely, what we need here is an intelligent
    // liniarIterator instead of a dumb circularIterator.


    SPDocumentList rsAll = null;
    int sizeSoFar = 0;
    LOGGER.log(Level.INFO, "Checking crawl queues of all ListStates/WebStates for pending docs.");
    for (final Iterator<WebState> iter = globalState.getCircularIterator(); iter.hasNext()
        && (sizeSoFar < hint);) {
      final WebState webState = iter.next();
      globalState.setCurrentWeb(webState);
      SPDocumentList rs = null;
      try {
        rs = sharepointClient.traverse(globalState, webState, sizeSoFar, checkForPendingDocs);
      } catch (final Exception e) {
        LOGGER.log(Level.WARNING, "Exception occured while traversing web URL [ "
            + webState.getWebUrl() + " ] ", e);
      } catch (final Throwable t) {
        LOGGER.log(Level.WARNING, "Error occured while traversing web URL [ "
            + webState.getWebUrl() + " ] ", t);
      }
      if ((rs != null) && (rs.size() > 0)) {
        LOGGER.log(Level.INFO, rs.size()
            + " document(s) to be sent from web URL [ " + webState.getWebUrl()
            + " ]. ");
        if (rsAll == null) {
          rsAll = rs;
        } else {
          rsAll.addAll(rs);
        }
        sizeSoFar = rsAll.size();
      } else {
        LOGGER.log(Level.CONFIG, "No documents to be sent from web [ "
            + webState.getWebUrl() + " ] ");
      }
    }
    return rsAll;
  }


  /**
   * Sets the traversal context
   *
   * @param traversalContext The {@link TraversalContext} instance
   */
  public void setTraversalContext(TraversalContext traversalContext) {
    this.traversalContext = traversalContext;
  }
}
// Source Code of com.google.enterprise.connector.sharepoint.spiimpl.SharepointTraversalManager
//
// Related Classes of com.google.enterprise.connector.sharepoint.spiimpl.SharepointTraversalManager