// Copyright 2007 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.google.enterprise.connector.sharepoint.client;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.enterprise.connector.sharepoint.client.AlertsHelper;
import com.google.enterprise.connector.sharepoint.client.SPConstants.FeedType;
import com.google.enterprise.connector.sharepoint.client.SPConstants.SPType;
import com.google.enterprise.connector.sharepoint.client.UserProfile2003Helper;
import com.google.enterprise.connector.sharepoint.client.UserProfile2007Helper;
import com.google.enterprise.connector.sharepoint.spiimpl.SPDocument;
import com.google.enterprise.connector.sharepoint.spiimpl.SPDocumentList;
import com.google.enterprise.connector.sharepoint.spiimpl.SharepointException;
import com.google.enterprise.connector.sharepoint.state.GlobalState;
import com.google.enterprise.connector.sharepoint.state.ListState;
import com.google.enterprise.connector.sharepoint.state.WebState;
import com.google.enterprise.connector.sharepoint.wsclient.client.ClientFactory;
import com.google.enterprise.connector.sharepoint.wsclient.client.ListsWS;
import com.google.enterprise.connector.spi.SpiConstants.ActionType;

import org.apache.axis.utils.XMLUtils;

import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
* This class provides a layer of abstraction between the SharePoint Traversal
* Manager and the Java clients that make the web service calls. Every time
* traversal is started/resumed, the connector goes through this layer. This
* class has the intelligence to know which web service should be consulted for
* a given purpose, and it has all the methods needed to get documents and
* sites from the SharePoint server.
*
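* <p>
* For illustration only (the exact calling sequence lives in the traversal
* manager, and {@code clientFactory}, {@code context}, {@code globalState} and
* {@code webState} are assumed to be supplied by the caller), a typical usage
* sketch might look like:
* <pre>
*   SharepointClient client = new SharepointClient(clientFactory, context);
*   client.updateGlobalState(globalState);  // discover webs, lists and docs
*   SPDocumentList docs = client.traverse(globalState, webState, 0, false);
* </pre>
* </p>
*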
* @author nitendra_thakur
*/
public class SharepointClient {
  private static final Logger LOGGER = Logger.getLogger(SharepointClient.class.getName());
  private final SharepointClientContext sharepointClientContext;
  private final ClientFactory clientFactory;
  private int nDocuments = 0;

  // true -> the threshold was not reached and all webs, all lists, and all
  // documents have been processed.
  // false -> a partial cycle was completed, i.e. the threshold was reached
  // before all the documents were processed.
  private boolean doCrawl;

  // This is mainly for test cases. It gives the count of ListStates that were
  // checked for any docs pending from the previous crawl cycle.
  private int noOfVisitedListStates = 0;

  public SharepointClient(final ClientFactory clientFactory,
      final SharepointClientContext inSharepointClientContext)
          throws SharepointException {
    this.clientFactory = clientFactory;
    sharepointClientContext = inSharepointClientContext;

    // Register a SAX client factory with Axis so that we can intercept SAX
    // parsing failures. This is needed to ignore some SAX parsing failures
    // such as duplicate attributes defined in the metadata of a document.
    XMLUtils.initSAXFactory(
        "com.google.enterprise.connector.sharepoint.wsclient.handlers.SaxErrorFactory",
        true, false);
  }

  /**
   * For a single ListState, handles its crawl queue (if any), adding the
   * queued documents to the result set which we give back to the Connector
   * Manager.
   *
   * @param globalState The recent snapshot of the whole in-memory state file.
   * @param web Represents the current web state
   * @param list Represents the current list state
   * @return {@link SPDocumentList} containing the crawled documents.
   */
  @VisibleForTesting
  SPDocumentList handleCrawlQueueForList(final GlobalState globalState,
      final WebState web, final ListState list) {
    if (null == web) {
      LOGGER.log(Level.WARNING, "web is not found");
      return null;
    }
    if (null == list) {
      LOGGER.log(Level.WARNING, "list is not found");
      return null;
    }

    final List<SPDocument> crawlQueue = list.getCrawlQueue();
    if (null == crawlQueue || crawlQueue.size() <= 0) {
      LOGGER.log(Level.FINE, "No CrawlQueue..");
      return null;
    }
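    // Validate each queued document: repair a missing parent list reference,
    // skip documents whose parent list does not match this crawl queue, and
    // set the context needed for downloading content.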
    ImmutableList.Builder<SPDocument> newListBuilder =
        new ImmutableList.Builder<SPDocument>();
    for (SPDocument doc : list.getCrawlQueue()) {
      ListState parentList = doc.getParentList();
      if (parentList == null) {
        LOGGER.log(Level.WARNING, "Document [{0}] is missing parent list. "
            + "Assigning [{1}] as parent list for document.",
            new Object[] {doc.getUrl(), list.getListURL()});
        doc.setParentList(list);
      } else {
        if (!list.getPrimaryKey().equals(parentList.getPrimaryKey())) {
          LOGGER.log(Level.WARNING,
              "Skipping document . Parent List - crawl queue mismatch"
              + " for document [{0}]. Parent List is [{1}]. "
              + "Crawl Queue is associated with list is [{2}].",
              new Object[] {doc, parentList, list});
          continue;
        }
      }
      doc.setParentWeb(web);
      doc.setSharepointClientContext(sharepointClientContext);
      // Update necessary information required for downloading contents.
      if (FeedType.CONTENT_FEED == doc.getFeedType()) {
        doc.setContentDwnldURL(doc.getUrl());
      }

      newListBuilder.add(doc);
      LOGGER.log(Level.FINEST, "[ DocId = " + doc.getDocId() + ", URL = "
          + doc.getUrl() + " ]");
    }
   
    ImmutableList<SPDocument> newlist = newListBuilder.build();
    if (newlist.isEmpty()) {
      // If all documents are skipped because of possible
      // crawl queue mismatch, then clear crawl queue for list.
      list.setCrawlQueue(null);      
      return null;
    }
   
    // Update crawl queue for list with filtered documents.
    list.setCrawlQueue(newlist);

    final SPDocumentList docList = new SPDocumentList(newlist, globalState);
    // FIXME These could be set in the traversal manager just before returning
    // from start/resumeTraversal.
    if (null != sharepointClientContext) {
      docList.setAliasMap(sharepointClientContext.getAliasMap());
      docList.setFQDNConversion(sharepointClientContext.isFQDNConversion());
      docList.setReWriteDisplayUrlUsingAliasMappingRules(sharepointClientContext.isReWriteDisplayUrlUsingAliasMappingRules());
      docList.setReWriteRecordUrlUsingAliasMappingRules(sharepointClientContext.isReWriteRecordUrlUsingAliasMappingRules());
    } else {
      LOGGER.log(Level.SEVERE, "sharepointClientContext not found!");
    }
    return docList;
  }

  /**
   * Scans the crawl queue of all the ListStates from a given WebState and
   * constructs a {@link SPDocumentList} object to be returned to CM.
   * {@link WebState#getCurrentListstateIterator()} ensures that the same list
   * is not scanned twice in case the traversal has been resumed.
   * <p/>
   * <p>
   * At the end, fetches the ACL of all the documents contained in the
   * {@link SPDocumentList} object. Ensures that ACL are not re-fetched when
   * documents from previous batch traversal are being returned.
   * </p>
   * <p>
   * <b>No documents are returned in case there are failures/errors while
   * retrieving ACLs</b>
   * </p>
   * <p>
   * Logs the {@link OutOfMemoryError} when fetching ACLs. To retry, edit the
   * properties in connectorInstance.xml and restart the connector.
   * <ul>
   * <li>If 'fetchACLInBatches' is enabled, tries to fetch ACLs in smaller
   * batches of (n/aclBatchSizeFactor) (n being the number of documents).</li>
   * <li>Both 'fetchACLInBatches' and 'aclBatchSizeFactor' can be edited from
   * connectorInstance.xml</li>
   * </ul>
   * </p>
   *
   * @param globalState The {@link GlobalState} representing all the SharePoint
   *          sites. Primary required when constructing the
   *          {@link SPDocumentList}
   * @param webState The {@link WebState} whose lists need to be scanned for
   *          documents
   * @param sizeSoFar This indicates the number of documents that have been
   *          previously fetched and added to the global crawl queue. This is
   *          useful in cases when a single list/site does not have sufficient
   *          documents that can match the batchHint and hence multiple
   *          site/lists need to be scanned.
   * @param sendPendingDocs True indicates that documents retrieved as part
   *          of the previous batch traversal need to be sent. This is the case
   *          when the connector returned batchHint or slightly more docs, but
   *          the CM did not feed all of them to the GSA and checkPoint() was
   *          called, implying there are docs from the previous batch traversal
   *          still to be sent. In such a case, ACLs should not be re-fetched.
   * @return {@link SPDocumentList} containing crawled {@link SPDocument}.
   */
  public SPDocumentList traverse(final GlobalState globalState,
      final WebState webState, int sizeSoFar, boolean sendPendingDocs) {
    if (webState == null) {
      LOGGER.warning("global state is null");
      return null;
    }

    noOfVisitedListStates = 0;
    SPDocumentList resultSet = null;
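    // When re-sending pending docs from a previous batch, iterate over all
    // the lists of this web; otherwise resume from the current list so that
    // already scanned lists are not re-scanned.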
    Iterator<ListState> iter = sendPendingDocs ? webState.getIterator()
        : webState.getCurrentListstateIterator();
    while (iter.hasNext()) {
      final ListState list = iter.next();
      if (list.isSiteDefaultPage()) {
        continue;
      }

      // Mark this list as current list so that the next traversal
      // request starts from here and already scanned lists are not
      // unnecessarily re-scanned.
      webState.setCurrentList(list);
      if (list.getCrawlQueue() == null) {
        continue;
      }

      SPDocumentList resultsList = null;

      try {
        LOGGER.log(Level.FINE, "Handling crawl queue for list URL [ "
            + list.getListURL() + " ]. ");
        resultsList = handleCrawlQueueForList(globalState, webState, list);
        noOfVisitedListStates++;
      } catch (final Exception e) {
        LOGGER.log(Level.WARNING, "Problem in handling crawl queue for list URL [ "
            + list.getListURL() + " ]. ", e);
      }

      if ((resultsList != null) && (resultsList.size() > 0)) {
        LOGGER.log(Level.INFO, resultsList.size()
            + " document(s) to be sent from list URL [ " + list.getListURL()
            + " ]. ");
        if (resultSet == null) {
          resultSet = resultsList;
        } else {
          resultSet.addAll(resultsList);
        }
      } else {
        LOGGER.log(Level.FINE, "No documents to be sent from list URL [ "
            + list.getListURL() + " ]. ");
      }
      if (resultsList != null) {
        sizeSoFar += resultsList.size();
      }

      // Check if the docs added so far meet the batchHint
      if (sizeSoFar >= sharepointClientContext.getBatchHint()) {
        LOGGER.info("Stopping traversal because batch hint "
            + sharepointClientContext.getBatchHint()
            + " has been reached. Processed documents: " + sizeSoFar);
        break;
      }
    }

    ListState listForWeb = webState.lookupList(webState.getPrimaryKey());
    if (listForWeb != null) {
      SPDocumentList resultsList =
          handleCrawlQueueForList(globalState, webState, listForWeb);
      if (resultsList != null) {
        if (resultSet != null) {
          resultSet.addAll(resultsList);
        } else {
          resultSet = resultsList;
        }       
      }
    }

    // Fetch ACL for all the documents crawled from the current WebState
    if (!handleACLForDocuments(resultSet, webState, globalState, sendPendingDocs)) {
      return null;
    }

    LOGGER.config(noOfVisitedListStates + " lists scanned from site "
        + webState.getWebUrl() + ". found " + resultSet + " docs");

    return resultSet;
  }

  /**
   * If the connector is set to push ACLs, fetches the ACLs. Takes care that
   * ACLs are not retrieved more than once, especially when documents are
   * pending from previous batch traversals.
   *
   * @param resultSet The list of documents discovered in current/previous batch
   *          traversals
   * @param webState The web state representing the site
   * @param globalState The global state representing the list of all sites and
   *          their information
   * @param sendPendingDocs True if the documents were discovered in previous
   *          batch traversal but fed in the current traversal OR false
   *          otherwise
   * @return True if ACL was retrieved successfully OR false in case of any
   *         exceptions/errors
   */
  @VisibleForTesting
  boolean handleACLForDocuments(SPDocumentList resultSet, WebState webState,
      GlobalState globalState, boolean sendPendingDocs) {

    if (!sharepointClientContext.isPushAcls()) {
      // When the connector is not set to feed ACLs no further checks are
      // required, just return true to send docs to CM and GSA
      return true;
    }

    if (resultSet == null || resultSet.size() == 0) {
      return true;
    }

    if (sendPendingDocs) {
      boolean missingAcls = false;
      for (SPDocument document : resultSet.getDocuments()) {
        missingAcls = document.isMissingAcls();
        if (missingAcls) {
          LOGGER.log(Level.WARNING,
              "Document [{0}] is missing ACL. This is an overflow document "
              + "from WebState [{1}]. Fetching ACLs for this batch.",
              new Object[] {document.getUrl(), webState.getWebUrl()});
          break;
        }
      }

      if (!missingAcls) {
        // This is to indicate that ACLs have been retrieved previously and
        // hence just return the set of docs
        return true;
      }
    }

    boolean aclRetrievalResult;
    // Fetch ACL for all the documents crawled from the current WebState
    // Do not try to re-fetch the ACL when documents are pending from
    // previous batch traversals
    int aclBatchSize = sharepointClientContext.getAclBatchSize();
    if (aclBatchSize <= 0) {
      aclRetrievalResult =
          fetchACLForDocuments(resultSet, webState, globalState);
    } else {
      aclRetrievalResult = fetchACLInBatches(resultSet, webState,
          globalState, aclBatchSize);
    }
    // Resolve SP Groups only if ACLs retrieval is successful
    if (aclRetrievalResult) {
      return resolveSharePointGroups(webState);     
    } else {
      LOGGER.log(Level.WARNING, "No documents will be sent for site [ "
        + webState.getWebUrl()
        + " ] as ACL retrieval has failed. Please check the errors/logs" +
        " associated with ACL retrieval before this");
      return false;
    }
  }

  /**
   * Resolves SharePoint Groups for WebState
   * @param webState for which SharePoint Groups needs to be resolved
   * @return boolean flag indicating if SharePoint Group Resolution for
   *         WebState is successful. True = Success. False = Failure
   */
  private boolean resolveSharePointGroups(WebState webState) {
    if (webState.getSPGroupsToResolve() == null ||
        webState.getSPGroupsToResolve().isEmpty()) {
      return true;
    }
    LOGGER.log(Level.INFO, "Resolving SharePoint Groups for ["
        + webState.getWebUrl() + "]");
    try {
      AclHelper aclHelper = new AclHelper(sharepointClientContext,
          webState.getWebUrl());
      return aclHelper.resolveSharePointGroups(webState);     
    } catch (Exception ex) {   
      // Return false indicating that SharePoint Group Resolution is failed.
      LOGGER.log(Level.WARNING,
          "Problem while resolving groups under WebState [ "
              + webState.getWebUrl() + " ].", ex);
      return false;
    }
  }

  /**
   * Fetches the ACL for documents.
   * <p>
   * Based on the size of ACL per document, the WS response can be large and
   * result in {@link java.lang.OutOfMemoryError}. In such a case, the connector
   * will log the error
   * </p>
   *
   * @param resultSet The list of documents for which ACL should be fetched.
   * @param webState The web state representing the site
   * @param globalState The global state representing the list of all sites and
   *          their information
   * @return True if ACL was retrieved successfully OR false in case of any
   *         exceptions/errors
   */
  private boolean fetchACLForDocuments(SPDocumentList resultSet,
      WebState webState, GlobalState globalState) {

    if (resultSet.size() <= 0) {
      LOGGER.log(Level.CONFIG, "Result set is empty. No documents to fetch ACL");
      return false;
    }

    LOGGER.log(Level.INFO, "Fetching ACLs for #" + resultSet.size()
        + " documents crawled from web " + webState.getWebUrl());
    try {
      AclHelper aclHelper = new AclHelper(sharepointClientContext,
          webState.getWebUrl());
      aclHelper.fetchAclForDocuments(resultSet, webState);
    } catch (Throwable t) {
      logError(resultSet, webState, t);
      // Return false indicating that the ACL retrieval for current batch
      // has failed and skipped
      return false;
    }

    // Return true indicating successful retrieval of ACL
    return true;
  }

  /**
   * Common method to log ACL retrieval errors
   *
   * @param resultSet The document list for which ACL retrieval was attempted
   * @param webState The web state under which the documents were crawled
   * @param te The error/exception encountered
   */
  private void logError(SPDocumentList resultSet, WebState webState,
      Throwable te) {

    // Check for OOM and indicate that connector service needs to be
    // restarted
    if (te instanceof OutOfMemoryError) {
      LOGGER.log(Level.SEVERE, "Connector encountered fatal error : \"OutOfMemoryError\" which might be due to a large web service response while fetching ACL for "
          + resultSet.size()
          + " documents for documents crawled under WebState [ "
          + webState.getWebUrl()
          + " ]. Please enable 'fetchACLInBatches' flag and modify the 'aclBatchSizeFactor' in connectorInstance.xml and restart the connector service", te);
    } else {
      LOGGER.log(Level.WARNING, "Problem while fetching ACLs for documents crawled under WebState [ "
          + webState.getWebUrl() + " ] . ", te);
    }

    LOGGER.warning("Skipping ACL retrieval for the document list : "
        + resultSet.toString());
  }

  /**
   * Fetches ACL for documents in batches. Required to handle
   * {@link OutOfMemoryError} kind of errors.
   * <ul>
   * <li>When re-fetching ACLs, tries to fetch in smaller batches of
   * n/batchSizeFactor (n being the number of documents).</li>
   * </ul>
   *
   * @param resultSet The set of documents whose ACL needs to be re-fetched in
   *          smaller batches
   * @param webState The {@link WebState} to which the documents belong
   * @param globalState The {@link GlobalState} required primarily for the
   *          {@link SPDocumentList}
   * @param batchSize Batch size to be used for fetching ACLs in batches
   * @return True if ACLs were retrieved successfully OR false in case of any
   *         exceptions/errors
   */
  /*
   * The access method is package level for JUnit test cases
   */
  boolean fetchACLInBatches(SPDocumentList resultSet, WebState webState,
      GlobalState globalState, int batchSize) {

    if (resultSet.size() <= 0) {
      LOGGER.log(Level.CONFIG, "Result set is empty. No documents to fetch ACL");
      return false;
    }
    LOGGER.info("The connector will attempt to fetch ACLs for documents in batches of "
        + batchSize);

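    // A worked example (illustrative): with 250 documents and a batchSize of
    // 100, the loop below fetches ACLs for the sub-lists [0, 100), [100, 200)
    // and [200, 250).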
    int toIndex = 0;
    for (int i = 0; i < resultSet.size(); i += batchSize) {
      // Use the batchSize to identify the subset of docs. The toIndex
      // indicates the end of sub-set with 'i' indicating the start.
      toIndex += batchSize;
      if (toIndex > resultSet.size()) {
        toIndex = resultSet.size();

        // In case the start and end index are the same, it would result in an
        // empty sub-list. So skip it and proceed to the next iteration.
        if (i == toIndex) {
          LOGGER.log(Level.WARNING, "The start and end index of the list of documents are the same. Skipping this batch.");
          continue;
        }
      }
      SPDocumentList docList = new SPDocumentList(
          resultSet.getDocuments().subList(i, toIndex), globalState);

      // Fetch ACL
      if (!fetchACLForDocuments(docList, webState, globalState)) {
        // Return false indicating ACL retrieval has failed and the
        // entire batch of documents need to be skipped
        return false;
      }
    }

    return true;
  }

  /**
   * Discovers extra webs, viz. MySites, Personal Sites, sites discovered by
   * GSSiteDiscovery, etc., and stores them into allSites.
   *
   * @param allSites
   * @param spType
   * @throws SharepointException
   */
  private void discoverExtraWebs(final Set<String> allSites, final SPType spType)
      throws SharepointException {
    // TODO: Move this to the client factory.
    if (SPType.SP2003 == spType) {
      LOGGER.log(Level.INFO, "Getting the initial list of MySites/Personal "
          + "sites for SharePoint type SP2003. Context URL [ "
          + sharepointClientContext.getSiteURL() + " ]");
      final UserProfile2003Helper userProfile =
          new UserProfile2003Helper(sharepointClientContext);
      if (userProfile.isSPS()) {// Check if SPS2003 or WSS 2.0
        try {
          final Set<String> personalSites = userProfile.getPersonalSiteList();
          allSites.addAll(personalSites);
        } catch (final Exception e) {
          LOGGER.log(Level.WARNING, "Unable to get MySites for the Context URL [ "
              + sharepointClientContext.getSiteURL() + " ]", e);
        }
      }
    } else if (SPType.SP2007 == spType) {
      final String strMySiteURL = sharepointClientContext.getMySiteBaseURL();
      if ((strMySiteURL != null) && (!strMySiteURL.trim().equals(""))) {
        LOGGER.log(Level.INFO, "Getting the initial list of MySites for SharePoint type SP2007 from MySiteBaseURL [ "
            + strMySiteURL + " ]");
        final UserProfile2007Helper userProfile =
            new UserProfile2007Helper(sharepointClientContext);
        if (userProfile.isSPS()) {
          try {
            final Set<String> lstMyLinks = userProfile.getMyLinks();
            allSites.addAll(lstMyLinks);// remove duplicates
          } catch (final Exception e) {
            LOGGER.log(Level.WARNING, "Unable to get MySites from MySiteBaseURL [ "
                + strMySiteURL + " ]", e);
          }

          try {
            final Set<String> personalSites = userProfile.getPersonalSiteList();
            allSites.addAll(personalSites);
          } catch (final Exception e) {
            LOGGER.log(Level.WARNING, "Unable to get Personal Sites for Context URL [ "
                + sharepointClientContext.getSiteURL() + " ]", e);
          }
        }
      }

      // Get all top level sites from the farm. Supported only in SP2007.
      final SiteDiscoveryHelper siteDiscovery =
          new SiteDiscoveryHelper(sharepointClientContext, null);
      final Set<String> sitecollection =
          siteDiscovery.getMatchingSiteCollections();
      allSites.addAll(sitecollection);
    }
  }

  /**
   * Iterates through the fresh list of webs in allSites and updates the global
   * state (i.e. adds a WebState if it is not there already).
   *
   * @param globalState
   * @param allSites
   * @return a set of all new webs that have been added to the globalstate
   */
  private Set<WebState> updateGlobalState(final GlobalState globalState,
      final Set<String> allSites) {
    Set<WebState> newWebs = new HashSet<WebState>();
    if ((null == allSites) || (allSites.size() == 0)) {
      return newWebs;
    }
    for (String url : allSites) {
      final WebState webState = updateGlobalState(globalState, url);
      if (null != webState) {
        newWebs.add(webState);
      }
    }
    return newWebs;
  }

  /**
   * Check for a web if it exists in the global state. If not, then creates a
   * corresponding web state and adds it into the global state.
   *
   * @param globalState
   * @param url
   * @return {@link WebState} null if the web state already exists in the
   *         global state (or could not be created); otherwise a valid
   *         reference to the newly created WebState
   */
  private WebState updateGlobalState(final GlobalState globalState,
      final String url) {
    WebState web = null;
    if (null == url) {
      LOGGER.log(Level.WARNING, "url not found!");
      return web;
    }
    String webUrl = url;
    WebState wsGS = globalState.lookupWeb(url, null);

    /*
     * The incoming URL might not always be exactly the web URL that was used
     * while creating the web state and that the web services require. Hence,
     * a second lookup against the resolved web URL is required.
     */
    if (null == wsGS) {
      final String webAppURL = Util.getWebApp(url);
      WebsHelper webs = null;
      try {
        sharepointClientContext.setSiteURL(webAppURL);
        webs = new WebsHelper(sharepointClientContext);
      } catch (final Exception e) {
        LOGGER.log(Level.WARNING, "WebsHelper creation failed for URL [ "
            + url + " ]. ", e);
      }
      if (null != webs) {
        webUrl = webs.getWebURLFromPageURL(url);
        if (!url.equals(webUrl)) {
          wsGS = globalState.lookupWeb(webUrl, null);
        }
      }
    }

    if (null == wsGS) {// new web
      LOGGER.config("Making WebState for : " + webUrl);
      try {
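        // Probe the Lists web service endpoint for the web; HTTP 400/404
        // indicates the URL is not a valid web, in which case WebState
        // creation is skipped and the URL is logged as excluded.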
        int responseCode = sharepointClientContext.checkConnectivity(
            Util.encodeURL(webUrl) + SPConstants.LISTS_END_POINT, null);
        if (responseCode != 400 && responseCode != 404) {
          web = globalState.makeWebState(sharepointClientContext, webUrl);
        } else {
          LOGGER.warning("Unable to connect to list web service for web. "
              + "Skipping WebState creation for URL [ " + webUrl + " ].");
          sharepointClientContext.logExcludedURL("[ " + webUrl
              + " ] identified as invalid Web Url");
        }
      } catch (final Exception e) {
        LOGGER.log(Level.WARNING, "Problem while creating web state for url [ "
            + webUrl + " ]. ", e);
      }
    } else {
      wsGS.setExisting(true);
    }

    return web;
  }

  /**
   * Discovers the child sites, MySites, Personal Sites, Sites discovered by
   * GSSite discovery. State information is updated as and when the webs are
   * discovered. A further call to updateWebStateFromSite is made to discover
   * the lists/libraries and the documents from each discovered web.
   *
   * @param globalState The recent state information
   */
  // FIXME SharePointClientContext should not be passed as an argument in the
  // methods that are called from here. Instead, use the class member.
  public void updateGlobalState(final GlobalState globalState)
      throws SharepointException {
    if (globalState == null) {
      LOGGER.warning("global state does not exist");
      return;
    }

    if (sharepointClientContext == null) {
      LOGGER.warning("sharepointClientContext is not found");
      return;
    }
    SharepointClientContext tempCtx = (SharepointClientContext) sharepointClientContext.clone();

    SiteDiscoveryHelper webCrawlInfoFetcher = null;
    if (sharepointClientContext.isUseSPSearchVisibility()) {
      webCrawlInfoFetcher = new SiteDiscoveryHelper(tempCtx, null);
    }

    // At the start of a new traversal cycle, we update the WebCrawlInfo of
    // all the webs
    if (globalState.isBFullReCrawl() && null != webCrawlInfoFetcher) {
      webCrawlInfoFetcher.updateWebCrawlInfoInBatch(globalState.getAllWebStateSet());
    }

    nDocuments = 0;
    doCrawl = true;

    ListState nextList = globalState.getLastCrawledList();
    WebState nextWeb = globalState.getLastCrawledWeb();

    if (null == nextWeb) {
      nextWeb = globalState.lookupWeb(sharepointClientContext.getSiteURL(),
          sharepointClientContext);
    } else {
      sharepointClientContext.setSiteURL(nextWeb.getWebUrl());
    }

    // start and end recrawl is used for detecting non-existent webs/lists
    globalState.startRecrawl();

    if (null == nextWeb) {
      nextWeb = updateGlobalState(globalState, sharepointClientContext.getSiteURL());
      if (null == nextWeb) {
        throw new SharepointException(
            "Starting WebState for the current traversal can not be determined.");
      }
      if (null != webCrawlInfoFetcher) {
        nextWeb.setWebCrawlInfo(webCrawlInfoFetcher.getCurrentWebCrawlInfo());
      }
    }

    LOGGER.info("Starting traversal from site [ " + nextWeb + " ]. ");

    SPType spType = nextWeb.getSharePointType();

    // To store the intermediate webs discovered during crawl
    Set<String> allSites = new TreeSet<String>();

    ArrayList<String> lstLookupForWebs = new ArrayList<String>();

    // Traverse sites and lists from the last crawled site and list to fetch
    // batch hint # of docs
    nextWeb = traverseSites(globalState, allSites, tempCtx, nextWeb, nextList, lstLookupForWebs);

    // This will contain all the newly discovered webs and is used to
    // identify those webs which should be queried for their search
    // visibility options set on SharePoint.
    Set<WebState> newWebs = new HashSet<WebState>();

    // Update all the web info into the globalstate. The newly discovered
    // webs, if any, will be processed in the same batch traversal in case
    // the batch hint # of documents have not been discovered
    newWebs.addAll(updateGlobalState(globalState, allSites));

    // Cases being handled here:
    // 1. Batch hint # of documents have not been discovered, but there are
    // new sites which have been discovered. Crawl documents till you get
    // the batch hint # of docs
    // 2. Batch hint # of documents have not been discovered and no new
    // sites have been discovered. In such cases get any new
    // personal/mysites, sites discovered by GSS. Add them to the global
    // state and crawl them till batch hint # of documents is reached.
    if (doCrawl && spType != null) {
      // If the first check has passed, it might mean Case 1. If the
      // following if block is skipped, it means this is Case 1, else it
      // will be Case 2
      if (newWebs.size() == 0) {
        // If this check passed, it means Case 2
        if (LOGGER.isLoggable(Level.CONFIG)) {
          LOGGER.log(Level.CONFIG, "Discovering new sites");
        }

        // Empty the current set of sites that have been traversed
        // before discovering the new ones. This is important in case
        // the current batch traversal has not discovered batch-hint no.
        // of docs. In such cases the connector should not traverse the
        // sites already traversed in the same batch traversal.
        allSites.clear();

        // Initiate the discovery of new sites
        discoverExtraWebs(allSites, spType);
        newWebs.addAll(updateGlobalState(globalState, allSites));
      }

      // The following does not care if the sites are discovered for Case
      // 1 or Case 2. It will simply go ahead and crawl batch hint no. of
      // docs from the new sites
      if (newWebs.size() > 0) {
        LOGGER.log(Level.INFO, "global state has been updated with #"
            + newWebs.size()
            + " newly discovered sites. About to traverse them for docs");
        if (null != webCrawlInfoFetcher) {
          webCrawlInfoFetcher.updateWebCrawlInfoInBatch(newWebs);
        }

        // Traverse sites and lists under them to fetch batch hint # of
        // docs
        traverseSites(globalState, allSites, tempCtx, nextWeb, nextList, lstLookupForWebs);
        newWebs.clear();

        // There are chances that new sites are discovered (child sites
        // OR linked sites) during the traversal of sites discovered as
        // linked sites themselves OR as child sites OR through GSS. In
        // such cases, the connector should just create webstates and
        // add them to the global state. The next batch traversal will
        // take them up for traversal
        newWebs.addAll(updateGlobalState(globalState, allSites));
        if (newWebs.size() > 0) {
          if (null != webCrawlInfoFetcher) {
            webCrawlInfoFetcher.updateWebCrawlInfoInBatch(newWebs);
          }
          doCrawl = false;
        }
      }
    } else if (newWebs.size() > 0 && null != webCrawlInfoFetcher) {
      // This is the case when we have reached the batch-hint while
      // crawling the first web itself and hence no further discovery
      // has been done. At this point, we must update the WebcrawlInfo of
      // all the child/linked sites that might have been discovered as
      // part of the site's crawling. If we do not do this here, these
      // webs will become known webs in the next batch traversal and we do
      // not query WebCrawlInfo of known webs in between a traversal
      // cycle.
      webCrawlInfoFetcher.updateWebCrawlInfoInBatch(newWebs);
    }

    globalState.setBFullReCrawl(doCrawl);
    globalState.endRecrawl(sharepointClientContext);

    if (null != sharepointClientContext.getUserDataStoreDAO()
        && sharepointClientContext.getUserDataStoreDAO().getUdsCacheSize() > 0) {
      sharepointClientContext.getUserDataStoreDAO().cleanupCache();
    }
    LOGGER.log(Level.INFO, "Returning after crawl cycle.. ");
  }

  public boolean isDoCrawl() {
    return doCrawl;
  }

  /**
   * Makes a call to the WS client layer to get the alerts for a site and
   * updates the global state. Alerts in SharePoint are created at the web
   * level, but in the state file that the connector maintains, an SPDocument
   * can only live inside a ListState. Hence, we need to create a dummy list
   * here with ListID = siteName_Alerts (to make it unique for alerts) and
   * LastMod = current time.
   *
   * @param webState
   * @param tempCtx
   */
  private void processAlerts(final WebState webState,
      final SharepointClientContext tempCtx) {
    if (null == webState) {
      return;
    }
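    // Build a unique dummy list key for alerts: the web's primary key,
    // followed by "/" if missing, then "_" plus the alerts type constant
    // (for example, <webPrimaryKey>/_<ALERTS_TYPE>).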
    String internalName = webState.getPrimaryKey();
    if (!internalName.endsWith("/")) {
      internalName += "/";
    }
    internalName += "_" + SPConstants.ALERTS_TYPE;

    final Calendar cLastMod = Calendar.getInstance();
    cLastMod.setTime(new Date());
    ListState currentDummyAlertList = null;

    try {
      currentDummyAlertList = new ListState(internalName,
          SPConstants.ALERTS_TYPE, SPConstants.ALERTS_TYPE, cLastMod,
          SPConstants.ALERTS_TYPE, internalName, webState);
    } catch (final Exception e) {
      LOGGER.log(Level.WARNING, "Unable to create the dummy list state for alerts. ", e);
      return;
    }
    if (currentDummyAlertList == null) {
      LOGGER.log(Level.WARNING, "Unable to create the dummy list state for alerts.");
      return;
    }

    // find the list in the Web state
    ListState dummyAlertListState = webState.lookupList(currentDummyAlertList.getPrimaryKey());
    if (dummyAlertListState == null) {
      dummyAlertListState = currentDummyAlertList;
    }
    LOGGER.log(Level.INFO, "Getting alerts. internalName [ " + internalName
        + " ] ");
    List<SPDocument> listCollectionAlerts = null;

    try {
      final AlertsHelper alerts = new AlertsHelper(tempCtx);
      listCollectionAlerts = alerts.getAlerts(webState, dummyAlertListState);
    } catch (final Exception e) {
      LOGGER.log(Level.WARNING, "Problem while getting alerts. ", e);
    }
    if (dummyAlertListState.isExisting()) {
      webState.AddOrUpdateListStateInWebState(dummyAlertListState, currentDummyAlertList.getLastMod());
      dummyAlertListState.setCrawlQueue(listCollectionAlerts);
      if (listCollectionAlerts != null) {
        nDocuments += listCollectionAlerts.size();
      }
    }
  }

  /**
   * Gets all the docs from the SPDocument Library and all the items and their
   * attachments from Generic Lists and Issues in SharePoint under a given site.
   * It first calls SiteData web service to get all the Lists. And then calls
   * Lists web service to get the list items for the lists which are of the type
   * SPDocument Library, Generic Lists or Issues. For attachments in Generic
   * List items and Issues, it calls Lists web service to get attachments for
   * these list items.
   *
   * @param tempCtx Current connector context
   * @param webState The state information of the web which is to be crawled for
   *          documents
   * @param nextList Last List traversed. If the current web contains this list,
   *          the traversal will start from here.
   * @param allWebs Contains all the webs that have been discovered from link
   *          sites/Site directory.
   */
  private void updateWebStateFromSite(final SharepointClientContext tempCtx,
      final WebState webState, ListState nextList, final Set<String> allWebs)
          throws SharepointException {
    List<SPDocument> listItems = new ArrayList<SPDocument>();

    // Get all the lists for the given web, e.g. picture, wiki, and document
    // libraries.
    final SiteDataHelper siteData = new SiteDataHelper(tempCtx);
    List<ListState> listCollection = siteData.getNamedLists(webState);

    // Remove duplicate lists, if any.
    // TODO: We do not need to do this. Web Service does not return
    // duplicate lists.
    listCollection = new ArrayList<ListState>(new TreeSet<ListState>(
        listCollection));

    try {
      SiteDiscoveryHelper gssd = new SiteDiscoveryHelper(
          tempCtx, webState.getWebUrl());
      gssd.updateListCrawlInfo(listCollection);
    } catch (Exception e) {
      LOGGER.log(Level.WARNING, "Exception occurred when trying to to update the ListCrawlInfo for web [ "
          + webState.getWebUrl() + " ] ", e);
    }

    // Update the latest metadata info for all list states. We could do this
    // update when the crawl begins; that would save this extra iteration over
    // the ListStates. But there is one piece of metadata which must be updated
    // before the change (ACL) detection and crawl begin: the
    // ListState.InheritedSecurity flag, which is very important while
    // processing ACL related changes.
    // TODO: with some re-structuring of code, we can still avoid this extra
    // iteration.
    for (ListState currentListState : listCollection) {
      ListState listState = webState.lookupList(currentListState.getPrimaryKey());
      if (null != listState) {
        if (!listState.getListURL().equalsIgnoreCase(
            currentListState.getListURL())) {
          tempCtx.logToFile(SPConstants.DEFAULT_VIEW_URL_CHANGE_LOG,
              listState.getListURL());         
        }
        listState.updateList(currentListState);       
      }
    }

    /*
     * If the nextList belongs to the current web and still exists on the
     * SharePoint site, start traversing from this list onwards.
     */
    if (null != nextList && nextList.getParentWebState().equals(webState)
        && listCollection.contains(nextList)) {
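      // Rotate the collection so that nextList becomes the first element and
      // traversal resumes from it.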
      Collections.rotate(listCollection, -(listCollection.indexOf(nextList)));
    }

    AclHelper aclHelper = new AclHelper(tempCtx, webState.getWebUrl());
    try {
      aclHelper.fetchAclChangesSinceTokenAndUpdateState(webState);
    } catch (final Exception e) {
      LOGGER.log(Level.WARNING, "Problem Interacting with Custom ACL WS. web site [ "
          + webState.getWebUrl() + " ]. ", e);
    }

    List<SPDocument> aclChangedItems = null;
    final ListsHelper listsHelper = new ListsHelper(tempCtx);
    for (int i = 0; i < listCollection.size(); i++) {
      final ListState currentList = listCollection.get(i);
      ListState listState = webState.lookupList(currentList.getPrimaryKey());

      if (sharepointClientContext.isUseSPSearchVisibility()) {
        // If this list is marked for No Crawling, do not crawl this
        // list.
        // Please note that, if this list is already known to the
        // connector, it'll keep existing in the connector's state. This
        // implies that if a list is marked as NoCrawl list on
        // SharePoint in between the connector's traversal, crawling of
        // this list will be paused at whatever state it is in. As soon
        // as the NoCrawl flag on SharePoint is reverted, the crawling
        // will be resumed from the saved state.
        if (currentList.isNoCrawl()) {
          LOGGER.log(Level.WARNING, "Skipping List URL [ "
              + currentList.getListURL()
              + " ] while crawling because it has been marked for No Crawling on SharePoint. ");
          if (null == listState) {
            // Make this list known by keeping it in the state. But,
            // do not crawl
            webState.AddOrUpdateListStateInWebState(currentList, currentList.getLastMod());
          }
          continue;
        }
      }

      /*
       * If we already knew about this list, then only fetch docs that have
       * changed since the last doc we processed. If it's a new list (e.g. the
       * first SharePoint traversal), we fetch everything.
       */
      if (listState == null) {
        listState = currentList;
        listState.setNewList(true);
        webState.AddOrUpdateListStateInWebState(listState, listState.getLastMod());
        LOGGER.info("discovered new listState. List URL: "
            + listState.getListURL());
        if (SPType.SP2007 == webState.getSharePointType()) {
          if (FeedType.CONTENT_FEED == sharepointClientContext.getFeedType()) {
            // In case of content feed, we need to keep track of
            // folders and the items under that. This is required
            // for sending delete feeds for the documents when their
            // parent folder is deleted.
            LOGGER.log(Level.CONFIG, "Discovering all folders under current list/library [ "
                + listState.getListURL() + " ] ");
            try {
              listsHelper.getSubFoldersRecursively(listState, null, null);
            } catch (final Exception e) {
              LOGGER.log(Level.WARNING, "Exception occured while getting the folders hierarchy for list [ "
                  + listState.getListURL() + " ]. ", e);
            } catch (final Throwable t) {
              LOGGER.log(Level.WARNING, "Error occured while getting the folders hierarchy for list [ "
                  + listState.getListURL() + " ]. ", t);
            }
          }

          try {
            listItems = listsHelper.getListItemChangesSinceToken(listState, allWebs);
          } catch (final Exception e) {
            LOGGER.log(Level.WARNING, "Exception thrown while getting the documents under list [ "
                + listState.getListURL() + " ].", e);
          } catch (final Throwable t) {
            LOGGER.log(Level.WARNING, "Error thrown while getting the documents under list [ "
                + listState.getListURL() + " ].", t);
          }
        } else {
          try {
            listItems = listsHelper.getListItems(listState, null, null, allWebs);
          } catch (final Exception e) {
            LOGGER.log(Level.WARNING, "Exception thrown while getting the documents under list [ "
                + listState.getListURL() + " ].", e);
          }
        }
      } else {
        LOGGER.info("revisiting listState [ " + listState.getListURL() + " ]. ");
        listState.setExisting(true);
        listState.setNextPage(null);

        String lastDocID = null;

        SPDocument lastDoc = listState.getLastDocForWSRefresh();

        /*
         * We must ensure that the last doc that we are using was actually sent
         * as ADD feed and not as DELETE feed. It might be possible that in one
         * cycle we identify a list as non-existing and hence started sending
         * delete feeds for it. But, in the next cycle that list has been
         * restored, in that case we can not rely on the lastDoc which has been
         * set by a delete feed. We also need to reset the change token in that
         * case to start a full crawl.
         */
        if (lastDoc != null) {
          if (FeedType.CONTENT_FEED == sharepointClientContext.getFeedType()
              && ActionType.DELETE.equals(lastDoc.getAction())) {
            listState.resetState();
            if (FeedType.CONTENT_FEED == sharepointClientContext.getFeedType()) {
              // In case of content feed, we need to keep track of
              // folders and the items under that. This is
              // required for sending delete feeds for the
              // documents when their parent folder is deleted.
              LOGGER.log(Level.CONFIG, "Discovering all folders under current list/library [ "
                  + listState.getListURL() + " ] ");
              try {
                listsHelper.getSubFoldersRecursively(listState, null, null);
              } catch (final Exception e) {
                LOGGER.log(Level.WARNING, "Exception occured while getting the folders hierarchy for list [ "
                    + listState.getListURL() + " ]. ", e);
              } catch (final Throwable t) {
                LOGGER.log(Level.WARNING, "Error occured while getting the folders hierarchy for list [ "
                    + listState.getListURL() + " ]. ", t);
              }
            }
            LOGGER.info("recrawling the items under listState [ "
                + listState.getListURL()
                + " ] because this list has been restored after deletion.");
          } else {
            lastDocID = Util.getOriginalDocId(lastDoc.getDocId(), sharepointClientContext.getFeedType());
          }
        }

        if (SPType.SP2007.equals(webState.getSharePointType())) {
          try {
            webState.AddOrUpdateListStateInWebState(listState, currentList.getLastMod());

            // Any documents to be crawled because of ACL Changes
            aclChangedItems = aclHelper.
                getListItemsForAclChangeAndUpdateState(listState, listsHelper);

            if (null == aclChangedItems
                || aclChangedItems.size() < sharepointClientContext.getBatchHint()) {
              // Do regular incremental crawl
              listItems = listsHelper.getListItemChangesSinceToken(listState, allWebs);
            }
          } catch (final Exception e) {
            LOGGER.log(Level.WARNING, "Exception thrown while getting the documents under list [ "
                + listState.getListURL() + " ].", e);
          } catch (final Throwable t) {
            LOGGER.log(Level.WARNING, "Error thrown while getting the documents under list [ "
                + listState.getListURL() + " ].", t);
          }
        } else {
          try {
            final Calendar dateSince = listState.getDateForWSRefresh();
            webState.AddOrUpdateListStateInWebState(listState, currentList.getLastMod());
            LOGGER.info("fetching changes since " + Util.formatDate(dateSince)
                + " for list [ " + listState.getListURL() + " ]. ");

            // check if date modified for the document library
            final Calendar dateCurrent = listState.getLastModCal();
            if (dateSince.before(dateCurrent)) {
              listState.setNewList(true);
            }

            listItems = listsHelper.getListItems(listState, dateSince, lastDocID, allWebs);
          } catch (final Exception e) {
            LOGGER.log(Level.WARNING, "Exception thrown while getting the documents under list [ "
                + listState.getListURL() + " ].", e);
          } catch (final Throwable t) {
            LOGGER.log(Level.WARNING, "Error thrown while getting the documents under list [ "
                + listState.getListURL() + " ].", t);
          }
        }
      }

      // Get the attachments for each discovered items, if the list allows
      // attachments
      if (listState.canContainAttachments() && (listItems != null)) {
        final List<SPDocument> attachmentItems = new ArrayList<SPDocument>();
        for (int j = 0; j < listItems.size(); j++) {
          final SPDocument doc = listItems.get(j);
          if (ActionType.ADD.equals(doc.getAction())) {
            final List<SPDocument> attachments = listsHelper.getAttachments(listState, doc);
            attachmentItems.addAll(attachments);
          }
        }
        listItems.addAll(attachmentItems);
      }

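      // A null next page means this list was traversed completely in this
      // pass; if it produced items (or is a new list), append the list's own
      // document so the list home page itself is fed. Otherwise the list is
      // only partially traversed and doCrawl must remain false.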
      if (listState.getNextPage() == null) {
        if (((listItems != null) && (listItems.size() > 0))
            || (listState.isNewList())) {
          SPDocument listDoc = listState.getDocumentInstance(
              sharepointClientContext.getFeedType());
          listItems.add(listDoc);
          listState.setNewList(false);
        }
      } else {
        // Send List home page as part of this batch to complete inheritance
        // chain for discovered child items for partially traversed List.
        if (listState.isNewList() && listItems != null && listItems.size() > 0
            && sharepointClientContext.getTraversalContext()
            .supportsInheritedAcls() && !Strings.isNullOrEmpty(
            listState.getListItemCollectionPositionNext())) {
          SPDocument listDoc = listState.getDocumentInstance(
              sharepointClientContext.getFeedType());
          listItems.add(listDoc);
        }

        // If any of the lists has not been traversed completely, doCrawl
        // must not be set to true.
        doCrawl = false;
      }

      // Add aclChangedItems to the docs crawled under regular crawling.
      // This is the right place to do this because all the operations
      // pertaining to regular crawling have been made. But, the
      // batch-hint check is yet to be done
      if (null != aclChangedItems) {
        if (null != listItems) {
          listItems.addAll(aclChangedItems);
        } else {
          listItems = aclChangedItems;
        }
      }

      listState.setCrawlQueue(listItems);
      // Set the last crawled date time. This is an informative value for the
      // user viewing the state file.
      listState.setLastCrawledDateTime(Util.getCurrentTimestampString());

      if (null == listItems || listItems.size() == 0) {
        LOGGER.log(Level.CONFIG, "No items found from list " + listState);
      } else {
        Collections.sort(listItems);
        LOGGER.log(Level.INFO, "found " + listItems.size()
            + " items from list " + listState);
        nDocuments += listItems.size();
        final int batchHint = sharepointClientContext.getBatchHint();

        // As per Issue 116 we need to stop at batchHint or a little
        // more
        if (nDocuments >= batchHint) {
          doCrawl = false;
          break;
        }
      }
    }// end:; for Lists

    // Set the last crawled date time. This is an informative value for the
    // user viewing the state file.
    webState.setLastCrawledDateTime(Util.getCurrentTimestampString());

    // Mark the current list as null so that the next time crawl queues are
    // scanned, all the ListStates are traversed and no documents that have
    // just been discovered get skipped.
    webState.setCurrentList(null);
  }

  /**
   * Traverses list of sites (webstates) which have not yet been crawled and
   * discovers new docs to be sent to GSA
   *
   * @param globalState The global state which has the list of sites (webstates)
   *          that need to be crawled for documents
   * @param allSites The list of sites
   * @param sharePointClientContext The current connector context. Instance of
   *          {@link SharepointClientContext}
   * @param nextWeb last site (webstate) that was crawled
   * @param nextList last ListState that was crawled
   * @param lstLookupForWebs webs which are already traversed and should not be
   *          traversed again
   * @throws SharepointException In case of any problems fetching documents
   * @return Last web crawled. This gives the caller an idea of where the
   *         next crawl should begin.
   */
  // TODO: Why do we pass SharePointClientContext object as argument here?
  // It's already available as a member of this class. Is there any
  // intentional differences between the states of these two
  // SharePointClientContexts?
  private WebState traverseSites(GlobalState globalState, Set<String> allSites,
      SharepointClientContext sharePointClientContext, WebState nextWeb,
      ListState nextList, ArrayList<String> lstLookupForWebs)
          throws SharepointException {
    globalState.setCurrentWeb(nextWeb);
    final Iterator<WebState> itWebs = globalState.getCircularIterator();
    while (itWebs.hasNext()) {
      WebState ws = itWebs.next(); // Get the first web
      if (ws == null) {
        continue;
      }

      final String webURL = ws.getPrimaryKey();

      // Note: The lookup table keeps track of the links which have been
      // visited so far.
      // This helps to curb the cyclic link problem in which SiteA can
      // have a link to SiteB and SiteB a link to SiteA.
      if (lstLookupForWebs.contains(webURL)) {
        continue;
      } else {
        lstLookupForWebs.add(webURL);
      }

      try {
        sharePointClientContext.setSiteURL(webURL);
      } catch (Exception e) {
        LOGGER.log(Level.WARNING, "Exception occurred when trying to set the webUrl [ "
            + webURL + " ] context", e);
        continue;
      }

      if (sharepointClientContext.isUseSPSearchVisibility()) {
        // Even if a web is not crawled due to the SP search visibility,
        // its reference is kept in the connector's state. This is to
        // avoid unnecessary discovery (and WebState construction) of
        // these webs again and again.
        if (ws.isNoCrawl()) {
          LOGGER.log(Level.WARNING, "Skipping Web URL [ "
              + webURL
              + " ] while crawling because it has been marked for No Crawling on SharePoint. ");
          continue;
        }
      }

      nextWeb = ws;
      LOGGER.config("Crawling site [ " + webURL + " ] ");
      final int currDocCount = nDocuments;
      try {
        // Process the web site, and add the link site info to allSites.
        updateWebStateFromSite(sharePointClientContext, ws, nextList, allSites);

        if (currDocCount == nDocuments) {
          // get Alerts for the web and update webState. The above
          // check is added to reduce the frequency with which
          // getAlerts WS call is made.
          LOGGER.fine("Getting alerts under site [ " + webURL + " ]");
          processAlerts(ws, sharePointClientContext);      
        }       
        ListState listForWeb = ws.lookupList(ws.getPrimaryKey());
        if (listForWeb != null) {
          LOGGER.fine("List State for web [ " + listForWeb.getListURL()
              + " ] is not null. Last Doc from List State is "
              + listForWeb.getLastDocProcessed());
        }
        boolean isFirstBatch = ((listForWeb == null)
            || (listForWeb.getLastDocProcessed() == null));
        // Crawl the site home page and web application policy in the
        // first batch and when a web application policy change is detected.
        if (ws.isWebApplicationPolicyChange()
            || isFirstBatch) {                       
          // Get site data for the web and update webState.       
          LOGGER.fine("Getting landing page data for the site [ " + webURL
              + " ]");
          processSiteData(ws, sharepointClientContext);
        }
      } catch (final Exception e) {
        LOGGER.log(Level.WARNING, "Following exception occured while traversing/updating web state URL [ "
            + webURL + " ]. ", e);
      } catch (final Throwable t) {
        LOGGER.log(Level.WARNING, "Following error occured while traversing/updating web state URL [ "
            + webURL + " ]. ", t);
      }

      // Check if the threshold (i.e. batchHint) is reached
      final int batchHint = sharepointClientContext.getBatchHint();

      // As per Issue 116 we need to stop at batchHint or a little more
      if (nDocuments >= batchHint) {
        LOGGER.info("Stopping crawl cycle as connector has discovered (>= batchHint) # of docs. In total : "
            + nDocuments + " docs. batch-hint is " + batchHint);
        doCrawl = false;
        break;
      }

      // Get the next web and discover its direct children
      sharepointClientContext.setSiteURL(webURL);
      WebsHelper webs = new WebsHelper(sharepointClientContext);
      try {
        final Set<String> allWebStateSet = webs.getDirectChildsites();
        final int size = allWebStateSet.size();
        if (size > 0) {
          LOGGER.log(Level.INFO, "Discovered " + size + " child sites under [ "
              + webURL + "]. ");
        } else {
          LOGGER.log(Level.CONFIG, "Discovered " + size
              + " child sites under [ " + webURL + "]. ");
        }
        allSites.addAll(allWebStateSet);
      } catch (final Exception e) {
        LOGGER.log(Level.WARNING, "Unable to get the Child sites for site "
            + webURL, e);
      }
    }
    return nextWeb;
  }

  /**
   * Returns the no of visited list states to check for pending docs from
   * previous batch traversal for a given web state (site)
   *
   * @return The no of visited list states
   */
  public int getNoOfVisitedListStates() {
    return noOfVisitedListStates;
  }

  /**
   * Makes a call to the SiteData web service to get data for a site and update
   * the global state. Site data in SharePoint is created at the site level,
   * but in the state file that the connector maintains, an SPDocument can
   * only live inside a ListState. Hence we need to create a dummy list here.
   *
   * @param webState the web state for which the SPDocument needs to be constructed.
   * @param tempCtx is the temporary SharepointClientContext object.
   */
  private void processSiteData(final WebState webState,
      final SharepointClientContext tempCtx) {
    if (null == webState) {
      return;
    }

    final Calendar cLastMod = Calendar.getInstance();
    cLastMod.setTime(new Date());
    ListState currentDummySiteDataList = null;

    try {
      currentDummySiteDataList = new ListState(webState.getPrimaryKey(),
          webState.getTitle(), webState.getPrimaryKey(), cLastMod,
          SPConstants.SITE, webState.getPrimaryKey(), webState);
    } catch (final Exception e) {
      LOGGER.log(Level.WARNING, "Unable to create the dummy list state for site. "
          + webState.getWebUrl(), e);
      return;
    }

    // find the list in the Web state
    ListState dummySiteListState =
        webState.lookupList(currentDummySiteDataList.getPrimaryKey());
    if (dummySiteListState == null) {
      dummySiteListState = currentDummySiteDataList;
    }
    LOGGER.log(Level.INFO, "Getting site data. internalName [ "
        + webState.getWebUrl() + " ] ");
    List<SPDocument> documentList = new ArrayList<SPDocument>();
    SPDocument document = null;

    try {
      // SharePoint Client Context used to create SiteDataWS should point to
      // WebState URL. If not then SharePoint default page will point to
      // incorrect Web ID for Web State.
      SharepointClientContext ctxToPass =
          (SharepointClientContext) tempCtx.clone();
      ctxToPass.setSiteURL(webState.getWebUrl());
      final SiteDataHelper siteData = new SiteDataHelper(ctxToPass);
      // Only fetch site data if the site still exists (webState is already
      // known to be non-null at this point).
      if (webState.isExisting()) {
        document = siteData.getSiteData(webState);
        document.setParentList(dummySiteListState);
        // Site Home Page document will be added as last doc from
        // dummy list state. This is required for sending delete feed.
      }
    } catch (final Exception e) {
      LOGGER.log(Level.WARNING, "Problem while getting site data. ", e);
    }

    // Web Application Policy Document processing.
    // Web Application Policy Document will be associated with each webstate.
    if (sharepointClientContext.isPushAcls()) {
      try {
        AclHelper aclHelper = new AclHelper(sharepointClientContext,
            webState.getWebUrl());
        SPDocument webAppPolicy = aclHelper.getWebApplicationPolicy(webState,
            sharepointClientContext.getFeedType().toString());
        if (webAppPolicy != null) {
          webAppPolicy.setParentList(dummySiteListState);
          documentList.add(webAppPolicy);
        }
      } catch (final Exception e) {
        LOGGER.log(Level.WARNING, "Problem while getting web app policy. ", e);
      }
    }
    if ((dummySiteListState.isExisting() ||
        webState.isWebApplicationPolicyChange())
        && null != document) {
      // Mark the dummy list state as the site default page in order to
      // differentiate it from the other lists in the web state.
      // Add the list page document.
      documentList.add(document);
      dummySiteListState.setSiteDefaultPage(true);
      webState.AddOrUpdateListStateInWebState(dummySiteListState, currentDummySiteDataList.getLastMod());
      dummySiteListState.setCrawlQueue(documentList);
      // Resetting web application policy change flag. This will ensure
      // same webstate will not be processed again
      // for web application policy change.
      webState.setWebApplicationPolicyChange(false);
    }
    nDocuments += documentList.size();
  }
}