Package com.google.walkaround.wave.server.googleimport

Source Code of com.google.walkaround.wave.server.googleimport.FindRemoteWavesProcessor

/*
* Copyright 2011 Google Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.google.walkaround.wave.server.googleimport;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Sets;
import com.google.common.primitives.Ints;
import com.google.inject.Inject;
import com.google.walkaround.proto.FindRemoteWavesTask;
import com.google.walkaround.proto.FindWaveletsForRemoteWaveTask;
import com.google.walkaround.proto.ImportSettings;
import com.google.walkaround.proto.ImportTaskPayload;
import com.google.walkaround.proto.ImportWaveletTask;
import com.google.walkaround.proto.RobotSearchDigest;
import com.google.walkaround.proto.gson.FindRemoteWavesTaskGsonImpl;
import com.google.walkaround.proto.gson.FindWaveletsForRemoteWaveTaskGsonImpl;
import com.google.walkaround.proto.gson.ImportTaskPayloadGsonImpl;
import com.google.walkaround.proto.gson.ImportWaveletTaskGsonImpl;
import com.google.walkaround.util.server.RetryHelper;
import com.google.walkaround.util.server.RetryHelper.PermanentFailure;
import com.google.walkaround.util.server.RetryHelper.RetryableFailure;
import com.google.walkaround.util.server.appengine.CheckedDatastore;
import com.google.walkaround.util.server.appengine.CheckedDatastore.CheckedTransaction;
import com.google.walkaround.util.shared.Assert;
import com.google.walkaround.wave.server.auth.StableUserId;
import com.google.walkaround.wave.server.gxp.SourceInstance;

import org.joda.time.LocalDate;
import org.waveprotocol.wave.model.id.IdUtil;
import org.waveprotocol.wave.model.id.WaveId;
import org.waveprotocol.wave.model.id.WaveletId;
import org.waveprotocol.wave.model.util.Pair;

import java.io.IOException;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import java.util.Set;
import java.util.logging.Logger;

import javax.annotation.Nullable;

/**
* Processes a {@link FindRemoteWavesTask}.
*
* @author ohler@google.com (Christian Ohler)
*/
public class FindRemoteWavesProcessor {

  @SuppressWarnings("unused")
  private static final Logger log = Logger.getLogger(FindRemoteWavesProcessor.class.getName());

  private final RobotApi.Factory robotApiFactory;
  private final SourceInstance.Factory sourceInstanceFactory;
  private final StableUserId userId;
  private final PerUserTable perUserTable;
  private final CheckedDatastore datastore;
  private final Random random;

  @Inject
  public FindRemoteWavesProcessor(RobotApi.Factory robotApiFactory,
      SourceInstance.Factory sourceInstanceFactory,
      StableUserId userId,
      PerUserTable perUserTable,
      CheckedDatastore datastore,
      Random random) {
    this.robotApiFactory = robotApiFactory;
    this.sourceInstanceFactory = sourceInstanceFactory;
    this.userId = userId;
    this.perUserTable = perUserTable;
    this.datastore = datastore;
    this.random = random;
  }

  // This used to be 300 but has been raised.  Some of the comments elsewhere in
  // the code probably still assume 300.
  private static final int MAX_RESULTS = 10000;

  private String getQueryDateRestriction(String facet, long dateDays) {
    LocalDate date = DaysSinceEpoch.toLocalDate(dateDays);
    return String.format("%s:%04d/%02d/%02d", facet,
        date.getYear(), date.getMonthOfYear(), date.getDayOfMonth());
  }

  private List<RobotSearchDigest> searchBetween(SourceInstance instance,
      long onOrAfterDays, long beforeDays) throws IOException {
    RobotApi api = robotApiFactory.create(instance.getApiUrl());
    String query = getQueryDateRestriction("after", onOrAfterDays)
        // The "before" search operator is inclusive (i.e., it means before the
        // end of the day); beforeDays is exclusive.
        + " " + getQueryDateRestriction("before", beforeDays - 1);
    return api.search(query, 0, MAX_RESULTS);
  }

  private long randomBetween(long min, long limit) {
    return min + random.nextInt(Ints.checkedCast(limit - min));
  }

  private List<Pair<Long, Long>> splitInterval(long onOrAfterDays, long beforeDays) {
    Preconditions.checkArgument(onOrAfterDays < beforeDays - 1,
        "Interval invalid or too small to split further: %s, %s", onOrAfterDays, beforeDays);
    // Split into roughly 5 intervals (if possible) because we want a high
    // branching factor (300*5^n reaches 1000, 10000 etc. quite a bit faster
    // than 300*2^n) and the maximum number of tasks GAE lets us add in one
    // transaction is 5.
    //
    // TreeSet for iteration order.
    Set<Long> splitPoints = Sets.newTreeSet();
    for (int i = 0; i < 4; i++) {
      // NOTE(ohler): Randomized strategy because it's simple to implement (the
      // cases where beforeDays - onOrAfterDays < 5 would require some thought
      // otherwise) and to make it unlikely that repeated runs send the same
      // queries to the googlewave.com servers, which seem to have a bug where
      // the result list is sometimes truncated for a query that has been issued
      // previously with a lower maxResults limit (perhaps some incorrect
      // caching).  Randomization means that re-running the "find waves" step
      // several times might have a greater chance to discover all waves.  But
      // I'm not positive whether this helps since I don't understand the bug.
      //
      // Other options include:
      //
      // * Instead of this interval splitting, start with "folder:3" or
      //   "before:2013/01/01" (for all waves), then do "before:<date of oldest
      //   wave returned by previous search>" until no more waves are returned.
      //   However, this relies on the assumption that truncated result lists
      //   are always truncated in such a way that only old waves are missing,
      //   not new waves.  We'd have to verify this.  Also, it's completely
      //   sequential rather than parallelizable.
      //
      // * Follow up every search for "after:A before:B" with another a search
      //   for "after:A before:<date of oldest wave returned by previous
      //   search>".  This could be a good combination of the two but relies on
      //   the same assumption and adds quite a bit more code.
      //
      // * When the user triggers the "find remote waves" task, enqueue N of
      //   them rather than just one, to cover the search space N times with
      //   different random interval splits to improve the likelihood that we
      //   find everything.  Could be good as well but adds code.
      //
      // * Add random negative search terms like -dgzimhmcoblhqfjciezc to the
      //   query that are unlikely to restrict the result set but make the query
      //   unique to avoid the poisoned caches.  Could also do many different
      //   such searches and merge the result sets.  (Can't assert that they are
      //   the same since waves may have been modified and fallen out of the
      //   date range.)  Probably worth implementing.
      //
      // * Fix the bug in googlewave.com or demonstrate that it's not
      //   reproducible.  Unlikely to happen since it's harder than any of these
      //   workarounds.
      splitPoints.add(randomBetween(onOrAfterDays + 1, beforeDays));
    }
    splitPoints.add(beforeDays);
    ImmutableList.Builder<Pair<Long, Long>> out = ImmutableList.builder();
    long left = onOrAfterDays;
    for (long right : splitPoints) {
      Assert.check(left < right, "left=%s, right=%s", left, right);
      out.add(Pair.of(left, right));
      left = right;
    }
    return out.build();
  }

  private List<ImportTaskPayload> makeTasks(
      SourceInstance instance, List<Pair<Long, Long>> intervals,
      @Nullable ImportSettings autoImportSettings) {
    log.info("intervals=" + intervals + ", settings=" + autoImportSettings);
    ImmutableList.Builder<ImportTaskPayload> accu = ImmutableList.builder();
    for (Pair<Long, Long> interval : intervals) {
      FindRemoteWavesTask task = new FindRemoteWavesTaskGsonImpl();
      task.setInstance(instance.serialize());
      task.setOnOrAfterDays(interval.getFirst());
      task.setBeforeDays(interval.getSecond());
      if (autoImportSettings != null) {
        task.setAutoImportSettings(autoImportSettings);
      }
      ImportTaskPayload payload = new ImportTaskPayloadGsonImpl();
      payload.setFindWavesTask(task);
      accu.add(payload);
    }
    return accu.build();
  }

  public List<ImportTaskPayload> makeRandomTasksForInterval(SourceInstance instance,
      long onOrAfterDays, long beforeDays, @Nullable ImportSettings autoImportSettings) {
    if (onOrAfterDays == beforeDays - 1) {
      return makeTasks(instance, ImmutableList.of(Pair.of(onOrAfterDays, beforeDays)),
          autoImportSettings);
    } else {
      return makeTasks(instance, splitInterval(onOrAfterDays, beforeDays),
          autoImportSettings);
    }
  }

  // Transaction limit is 500 entities but let's stay well below that.
  private static final int MAX_WAVELETS_PER_TRANSACTION = 300;

  private void storeResults(List<RemoteConvWavelet> results) throws PermanentFailure {
    for (final List<RemoteConvWavelet> partition
        : Iterables.partition(results, MAX_WAVELETS_PER_TRANSACTION)) {
      new RetryHelper().run(
          new RetryHelper.VoidBody() {
            @Override public void run() throws RetryableFailure, PermanentFailure {
              CheckedTransaction tx = datastore.beginTransaction();
              try {
                if (perUserTable.addRemoteWavelets(tx, userId, partition)) {
                  tx.commit();
                }
              } finally {
                tx.close();
              }
            }
          });
    }
    log.info("Successfully added " + results.size() + " remote waves");
  }

  private void scheduleFindWaveletTasks(final SourceInstance instance,
      List<RobotSearchDigest> results, @Nullable final ImportSettings autoImportSettings)
      throws PermanentFailure {
    for (final List<RobotSearchDigest> partition : Iterables.partition(results,
            // 5 tasks per transaction.
            5)) {
      new RetryHelper().run(
          new RetryHelper.VoidBody() {
            @Override public void run() throws RetryableFailure, PermanentFailure {
              CheckedTransaction tx = datastore.beginTransaction();
              try {
                for (RobotSearchDigest result : partition) {
                  FindWaveletsForRemoteWaveTask task = new FindWaveletsForRemoteWaveTaskGsonImpl();
                  task.setInstance(instance.serialize());
                  task.setWaveDigest(result);
                  if (autoImportSettings != null) {
                    task.setAutoImportSettings(autoImportSettings);
                  }
                  ImportTaskPayload payload = new ImportTaskPayloadGsonImpl();
                  payload.setFindWaveletsTask(task);
                  perUserTable.addTask(tx, userId, payload);
                }
                tx.commit();
              } finally {
                tx.close();
              }
            }
          });
    }
    log.info("Successfully scheduled import of " + results.size() + " waves");
  }

  private void scheduleImportTasks(List<RemoteConvWavelet> results,
      final ImportSettings autoImportSettings) throws PermanentFailure {
    for (final List<RemoteConvWavelet> partition : Iterables.partition(results,
            // 5 tasks per transaction.
            5)) {
      new RetryHelper().run(
          new RetryHelper.VoidBody() {
            @Override public void run() throws RetryableFailure, PermanentFailure {
              CheckedTransaction tx = datastore.beginTransaction();
              try {
                for (RemoteConvWavelet wavelet : partition) {
                  ImportWaveletTask task = new ImportWaveletTaskGsonImpl();
                  task.setInstance(wavelet.getSourceInstance().serialize());
                  task.setWaveId(wavelet.getDigest().getWaveId());
                  task.setWaveletId(wavelet.getWaveletId().serialise());
                  task.setSettings(autoImportSettings);
                  ImportTaskPayload payload = new ImportTaskPayloadGsonImpl();
                  payload.setImportWaveletTask(task);
                  perUserTable.addTask(tx, userId, payload);
                }
                tx.commit();
              } finally {
                tx.close();
              }
            }
          });
    }
    log.info("Successfully scheduled import of " + results.size() + " waves");
  }

  private List<RemoteConvWavelet> expandPrivateReplies(SourceInstance instance,
      RobotSearchDigest digest) throws IOException {
    RobotApi api = robotApiFactory.create(instance.getApiUrl());
    ImmutableList.Builder<RemoteConvWavelet> wavelets = ImmutableList.builder();
    WaveId waveId = WaveId.deserialise(digest.getWaveId());
    // The robot API only allows access to waves with ids that start with "w".
    if (!waveId.getId().startsWith(IdUtil.WAVE_PREFIX + "+")) {
      log.info("Wave " + waveId + " not accessible through Robot API, skipping");
    } else {
      log.info("Getting wave view for " + waveId);
      List<WaveletId> waveletIds = api.getWaveView(waveId);
      log.info("Wave view for " + waveId + ": " + waveletIds);
      for (WaveletId waveletId : waveletIds) {
        if (IdUtil.isConversationalId(waveletId)) {
          wavelets.add(new RemoteConvWavelet(instance, digest, waveletId, null, null));
        } else {
          log.info("Skipping non-conv wavelet " + waveletId);
        }
      }
    }
    return wavelets.build();
  }

  public List<ImportTaskPayload> findWavelets(FindWaveletsForRemoteWaveTask task)
      throws IOException, PermanentFailure {
    SourceInstance instance = sourceInstanceFactory.parseUnchecked(task.getInstance());
    List<RemoteConvWavelet> wavelets = expandPrivateReplies(instance, task.getWaveDigest());
    if (wavelets.isEmpty()) {
      return ImmutableList.of();
    }
    storeResults(wavelets);
    if (task.hasAutoImportSettings()) {
      scheduleImportTasks(wavelets, task.getAutoImportSettings());
    }
    return ImmutableList.of();
  }

  public List<ImportTaskPayload> findWaves(FindRemoteWavesTask task)
      throws IOException, PermanentFailure {
    SourceInstance instance = sourceInstanceFactory.parseUnchecked(task.getInstance());
    long onOrAfterDays = task.getOnOrAfterDays();
    long beforeDays = task.getBeforeDays();
    List<RobotSearchDigest> results = searchBetween(instance, onOrAfterDays, beforeDays);
    log.info("Search found " + results.size() + " waves");
    if (results.isEmpty()) {
      return ImmutableList.of();
    }
    @Nullable ImportSettings autoImportSettings =
        task.hasAutoImportSettings() ? task.getAutoImportSettings() : null;
    // NOTE(ohler): Having many concurrent tasks like this that all need to write to
    // the PerUserTable leads to a lot of write contention.  We'll just have to
    // keep max-concurrent-requests low.
    scheduleFindWaveletTasks(instance, results, autoImportSettings);
    if (results.size() >= MAX_RESULTS) {
      // Result list is most likely truncated, repeat with smaller intervals.
      log.info("Got " + results.size() + " results between "  + onOrAfterDays + " and " + beforeDays
          + ", splitting");
      if (beforeDays - onOrAfterDays <= 1) {
        throw new RuntimeException("Can't split further; too many results (" + results.size()
            + ") between " + onOrAfterDays + " and " + beforeDays);
      }
      return makeRandomTasksForInterval(instance, onOrAfterDays, beforeDays, autoImportSettings);
    } else {
      return ImmutableList.of();
    }
  }

}
TOP

Related Classes of com.google.walkaround.wave.server.googleimport.FindRemoteWavesProcessor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.