/*
* Copyright 2011 Google Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.walkaround.wave.server.googleimport;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Sets;
import com.google.common.primitives.Ints;
import com.google.inject.Inject;
import com.google.walkaround.proto.FindRemoteWavesTask;
import com.google.walkaround.proto.FindWaveletsForRemoteWaveTask;
import com.google.walkaround.proto.ImportSettings;
import com.google.walkaround.proto.ImportTaskPayload;
import com.google.walkaround.proto.ImportWaveletTask;
import com.google.walkaround.proto.RobotSearchDigest;
import com.google.walkaround.proto.gson.FindRemoteWavesTaskGsonImpl;
import com.google.walkaround.proto.gson.FindWaveletsForRemoteWaveTaskGsonImpl;
import com.google.walkaround.proto.gson.ImportTaskPayloadGsonImpl;
import com.google.walkaround.proto.gson.ImportWaveletTaskGsonImpl;
import com.google.walkaround.util.server.RetryHelper;
import com.google.walkaround.util.server.RetryHelper.PermanentFailure;
import com.google.walkaround.util.server.RetryHelper.RetryableFailure;
import com.google.walkaround.util.server.appengine.CheckedDatastore;
import com.google.walkaround.util.server.appengine.CheckedDatastore.CheckedTransaction;
import com.google.walkaround.util.shared.Assert;
import com.google.walkaround.wave.server.auth.StableUserId;
import com.google.walkaround.wave.server.gxp.SourceInstance;
import org.joda.time.LocalDate;
import org.waveprotocol.wave.model.id.IdUtil;
import org.waveprotocol.wave.model.id.WaveId;
import org.waveprotocol.wave.model.id.WaveletId;
import org.waveprotocol.wave.model.util.Pair;
import java.io.IOException;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import java.util.Set;
import java.util.logging.Logger;
import javax.annotation.Nullable;
/**
* Processes a {@link FindRemoteWavesTask}.
*
* @author ohler@google.com (Christian Ohler)
*/
public class FindRemoteWavesProcessor {
// Logger used by the methods of this class. (The former
// @SuppressWarnings("unused") was stale: the logger is used, and the
// suppression would only hide a genuine warning if that ever changed.)
private static final Logger log = Logger.getLogger(FindRemoteWavesProcessor.class.getName());
// Creates RobotApi clients from a source instance's API URL.
private final RobotApi.Factory robotApiFactory;
// Parses the serialized source instance carried inside tasks.
private final SourceInstance.Factory sourceInstanceFactory;
// The user on whose behalf wavelets and tasks are written to the PerUserTable.
private final StableUserId userId;
// Receives the discovered remote wavelets and the follow-up tasks.
private final PerUserTable perUserTable;
// Source of the transactions in which the PerUserTable is updated.
private final CheckedDatastore datastore;
// Randomness used to pick the split points when an interval is subdivided.
private final Random random;
/**
 * Creates a processor from its collaborators; the constructor is annotated
 * with {@code @Inject} so that the dependency injector can supply them.
 *
 * @param robotApiFactory creates robot API clients for a source instance
 * @param sourceInstanceFactory parses serialized source instances from tasks
 * @param userId the user whose per-user table is written to
 * @param perUserTable table that stores remote wavelets and pending tasks
 * @param datastore datastore used to open transactions
 * @param random randomness used when splitting date intervals
 */
@Inject
public FindRemoteWavesProcessor(
    RobotApi.Factory robotApiFactory,
    SourceInstance.Factory sourceInstanceFactory,
    StableUserId userId,
    PerUserTable perUserTable,
    CheckedDatastore datastore,
    Random random) {
  // Remote-side collaborators.
  this.sourceInstanceFactory = sourceInstanceFactory;
  this.robotApiFactory = robotApiFactory;
  // Storage-side collaborators.
  this.datastore = datastore;
  this.perUserTable = perUserTable;
  this.userId = userId;
  // Interval splitting.
  this.random = random;
}
// Result limit passed to the robot search in searchBetween. findWaves treats
// a result list of at least this size as probably truncated and responds by
// re-searching smaller date intervals.
//
// This used to be 300 but has been raised. Some of the comments elsewhere in
// the code probably still assume 300.
private static final int MAX_RESULTS = 10000;
/**
 * Builds one date term of a robot search query, in the form
 * {@code <facet>:yyyy/MM/dd} (for example {@code after:2011/03/07}).
 *
 * @param facet the search operator to put before the colon; callers in this
 *     class pass "after" or "before"
 * @param dateDays the date as a day number, converted with
 *     {@code DaysSinceEpoch.toLocalDate}
 * @return the formatted query term
 */
private String getQueryDateRestriction(String facet, long dateDays) {
  LocalDate date = DaysSinceEpoch.toLocalDate(dateDays);
  // The term is part of a query sent to a remote server, not text for humans,
  // so format it with Locale.ROOT. Without an explicit locale, String.format
  // uses the JVM default locale, under which %d may be rendered with
  // locale-specific (non-ASCII) digits.
  return String.format(Locale.ROOT, "%s:%04d/%02d/%02d", facet,
      date.getYear(), date.getMonthOfYear(), date.getDayOfMonth());
}
/**
 * Searches the given source instance for waves dated within the day interval
 * from {@code onOrAfterDays} up to, but not including, {@code beforeDays}.
 *
 * @param instance the instance whose robot API is queried
 * @param onOrAfterDays first day of the interval
 * @param beforeDays day just past the end of the interval (exclusive)
 * @return the digests returned by the search; {@code MAX_RESULTS} is passed
 *     to the search call as its limit
 * @throws IOException if the robot API call fails
 */
private List<RobotSearchDigest> searchBetween(SourceInstance instance,
    long onOrAfterDays, long beforeDays) throws IOException {
  RobotApi robotApi = robotApiFactory.create(instance.getApiUrl());
  String lowerBoundTerm = getQueryDateRestriction("after", onOrAfterDays);
  // The "before" search operator is inclusive (i.e., it means before the
  // end of the day) whereas beforeDays is exclusive, hence the "- 1".
  String upperBoundTerm = getQueryDateRestriction("before", beforeDays - 1);
  String searchQuery = lowerBoundTerm + " " + upperBoundTerm;
  return robotApi.search(searchQuery, 0, MAX_RESULTS);
}
/**
 * Draws a random value from the half-open range {@code [min, limit)}.
 *
 * <p>The width {@code limit - min} is converted with
 * {@code Ints.checkedCast} before being handed to {@code Random.nextInt}.
 *
 * @param min smallest value that can be returned
 * @param limit exclusive upper bound
 * @return {@code min} plus a random non-negative offset below {@code limit - min}
 */
private long randomBetween(long min, long limit) {
  int width = Ints.checkedCast(limit - min);
  int offset = random.nextInt(width);
  return min + offset;
}
/**
 * Randomly cuts the day interval {@code [onOrAfterDays, beforeDays)} into
 * consecutive, non-empty sub-intervals that together cover it exactly.
 *
 * <p>Four cut points are drawn at random strictly inside the interval;
 * duplicate draws collapse, so the result holds between two and five pieces.
 *
 * @param onOrAfterDays start of the interval (inclusive)
 * @param beforeDays end of the interval (exclusive); the interval must span
 *     at least two days
 * @return the sub-intervals as (start, end) pairs in ascending order
 * @throws IllegalArgumentException if the interval is invalid or spans
 *     fewer than two days
 */
private List<Pair<Long, Long>> splitInterval(long onOrAfterDays, long beforeDays) {
  Preconditions.checkArgument(onOrAfterDays < beforeDays - 1,
      "Interval invalid or too small to split further: %s, %s", onOrAfterDays, beforeDays);
  // Split into roughly 5 intervals (if possible) because we want a high
  // branching factor (300*5^n reaches 1000, 10000 etc. quite a bit faster
  // than 300*2^n) and the maximum number of tasks GAE lets us add in one
  // transaction is 5. Four interior cut points yield at most 5 pieces.
  final int interiorCuts = 4;
  // NOTE(ohler): Randomized strategy because it's simple to implement (the
  // cases where beforeDays - onOrAfterDays < 5 would require some thought
  // otherwise) and to make it unlikely that repeated runs send the same
  // queries to the googlewave.com servers, which seem to have a bug where
  // the result list is sometimes truncated for a query that has been issued
  // previously with a lower maxResults limit (perhaps some incorrect
  // caching). Randomization means that re-running the "find waves" step
  // several times might have a greater chance to discover all waves. But
  // I'm not positive whether this helps since I don't understand the bug.
  //
  // Other options include:
  //
  // * Instead of this interval splitting, start with "folder:3" or
  //   "before:2013/01/01" (for all waves), then do "before:<date of oldest
  //   wave returned by previous search>" until no more waves are returned.
  //   However, this relies on the assumption that truncated result lists
  //   are always truncated in such a way that only old waves are missing,
  //   not new waves. We'd have to verify this. Also, it's completely
  //   sequential rather than parallelizable.
  //
  // * Follow up every search for "after:A before:B" with another search
  //   for "after:A before:<date of oldest wave returned by previous
  //   search>". This could be a good combination of the two but relies on
  //   the same assumption and adds quite a bit more code.
  //
  // * When the user triggers the "find remote waves" task, enqueue N of
  //   them rather than just one, to cover the search space N times with
  //   different random interval splits to improve the likelihood that we
  //   find everything. Could be good as well but adds code.
  //
  // * Add random negative search terms like -dgzimhmcoblhqfjciezc to the
  //   query that are unlikely to restrict the result set but make the query
  //   unique to avoid the poisoned caches. Could also do many different
  //   such searches and merge the result sets. (Can't assert that they are
  //   the same since waves may have been modified and fallen out of the
  //   date range.) Probably worth implementing.
  //
  // * Fix the bug in googlewave.com or demonstrate that it's not
  //   reproducible. Unlikely to happen since it's harder than any of these
  //   workarounds.
  //
  // A sorted set, so that the cut points are visited in ascending order below
  // and duplicate draws are dropped.
  Set<Long> boundaries = Sets.newTreeSet();
  int drawn = 0;
  while (drawn < interiorCuts) {
    boundaries.add(randomBetween(onOrAfterDays + 1, beforeDays));
    drawn++;
  }
  // The end of the whole interval closes the last piece.
  boundaries.add(beforeDays);
  ImmutableList.Builder<Pair<Long, Long>> pieces = ImmutableList.builder();
  long lower = onOrAfterDays;
  for (long upper : boundaries) {
    Assert.check(lower < upper, "left=%s, right=%s", lower, upper);
    pieces.add(Pair.of(lower, upper));
    // The next piece starts where this one ends.
    lower = upper;
  }
  return pieces.build();
}
/**
 * Creates one find-waves task payload per interval.
 *
 * @param instance the source instance every task refers to
 * @param intervals (onOrAfterDays, beforeDays) pairs, one per task
 * @param autoImportSettings settings copied into each task, or null to leave
 *     the tasks without auto-import settings
 * @return the payloads, in the order of {@code intervals}
 */
private List<ImportTaskPayload> makeTasks(
    SourceInstance instance, List<Pair<Long, Long>> intervals,
    @Nullable ImportSettings autoImportSettings) {
  log.info("intervals=" + intervals + ", settings=" + autoImportSettings);
  ImmutableList.Builder<ImportTaskPayload> payloads = ImmutableList.builder();
  for (Pair<Long, Long> dayRange : intervals) {
    payloads.add(makeFindWavesPayload(instance, dayRange, autoImportSettings));
  }
  return payloads.build();
}

/** Wraps a find-waves task for a single day range in an import task payload. */
private ImportTaskPayload makeFindWavesPayload(SourceInstance instance,
    Pair<Long, Long> dayRange, @Nullable ImportSettings autoImportSettings) {
  FindRemoteWavesTask findTask = new FindRemoteWavesTaskGsonImpl();
  findTask.setInstance(instance.serialize());
  findTask.setOnOrAfterDays(dayRange.getFirst());
  findTask.setBeforeDays(dayRange.getSecond());
  // Auto-import settings are optional; they are only set when supplied.
  if (autoImportSettings != null) {
    findTask.setAutoImportSettings(autoImportSettings);
  }
  ImportTaskPayload wrapped = new ImportTaskPayloadGsonImpl();
  wrapped.setFindWavesTask(findTask);
  return wrapped;
}
/**
 * Creates find-waves task payloads that together cover the day interval
 * {@code [onOrAfterDays, beforeDays)}.
 *
 * <p>A one-day interval yields a single task for exactly that interval.
 * Any other interval is handed to {@code splitInterval}, which cuts it at
 * random points (and rejects intervals that are empty or inverted).
 *
 * @param instance the source instance to search
 * @param onOrAfterDays start of the interval (inclusive)
 * @param beforeDays end of the interval (exclusive)
 * @param autoImportSettings settings to attach to every task, or null
 * @return one payload per resulting sub-interval
 */
public List<ImportTaskPayload> makeRandomTasksForInterval(SourceInstance instance,
    long onOrAfterDays, long beforeDays, @Nullable ImportSettings autoImportSettings) {
  List<Pair<Long, Long>> intervals;
  if (beforeDays - 1 != onOrAfterDays) {
    intervals = splitInterval(onOrAfterDays, beforeDays);
  } else {
    // A single day cannot be split any further.
    intervals = ImmutableList.of(Pair.of(onOrAfterDays, beforeDays));
  }
  return makeTasks(instance, intervals, autoImportSettings);
}
// Largest number of wavelets that storeResults writes within a single
// datastore transaction; longer result lists are partitioned into chunks of
// this size.
// Transaction limit is 500 entities but let's stay well below that.
private static final int MAX_WAVELETS_PER_TRANSACTION = 300;
/**
 * Records the given remote wavelets in the per-user table, writing at most
 * {@code MAX_WAVELETS_PER_TRANSACTION} of them per transaction. Each chunk is
 * run through its own {@code RetryHelper}.
 *
 * @param results the wavelets to record
 * @throws PermanentFailure if the retry helper reports a permanent failure
 */
private void storeResults(List<RemoteConvWavelet> results) throws PermanentFailure {
  Iterable<List<RemoteConvWavelet>> chunks =
      Iterables.partition(results, MAX_WAVELETS_PER_TRANSACTION);
  for (final List<RemoteConvWavelet> chunk : chunks) {
    RetryHelper retrier = new RetryHelper();
    retrier.run(new RetryHelper.VoidBody() {
      @Override public void run() throws RetryableFailure, PermanentFailure {
        addWaveletsInOwnTransaction(chunk);
      }
    });
  }
  log.info("Successfully added " + results.size() + " remote waves");
}

/**
 * Adds one chunk of wavelets to the per-user table inside a fresh
 * transaction. The transaction is committed only when
 * {@code addRemoteWavelets} returns true, and is always closed.
 */
private void addWaveletsInOwnTransaction(List<RemoteConvWavelet> chunk)
    throws RetryableFailure, PermanentFailure {
  CheckedTransaction tx = datastore.beginTransaction();
  try {
    boolean commitRequested = perUserTable.addRemoteWavelets(tx, userId, chunk);
    if (commitRequested) {
      tx.commit();
    }
  } finally {
    tx.close();
  }
}
/**
 * Enqueues, for every search result, a task that will look up the wavelets
 * of that remote wave. Tasks are added to the per-user table five at a time,
 * each group in its own transaction run through a {@code RetryHelper}.
 *
 * @param instance the source instance the results came from
 * @param results digests of the waves found by the search
 * @param autoImportSettings settings to attach to every task, or null to
 *     leave the tasks without auto-import settings
 * @throws PermanentFailure if the retry helper reports a permanent failure
 */
private void scheduleFindWaveletTasks(final SourceInstance instance,
    List<RobotSearchDigest> results, @Nullable final ImportSettings autoImportSettings)
    throws PermanentFailure {
  // 5 tasks per transaction.
  final int tasksPerTransaction = 5;
  for (final List<RobotSearchDigest> partition
      : Iterables.partition(results, tasksPerTransaction)) {
    new RetryHelper().run(
        new RetryHelper.VoidBody() {
          @Override public void run() throws RetryableFailure, PermanentFailure {
            CheckedTransaction tx = datastore.beginTransaction();
            try {
              for (RobotSearchDigest result : partition) {
                FindWaveletsForRemoteWaveTask task = new FindWaveletsForRemoteWaveTaskGsonImpl();
                task.setInstance(instance.serialize());
                task.setWaveDigest(result);
                // Auto-import settings are optional; only set when supplied.
                if (autoImportSettings != null) {
                  task.setAutoImportSettings(autoImportSettings);
                }
                ImportTaskPayload payload = new ImportTaskPayloadGsonImpl();
                payload.setFindWaveletsTask(task);
                perUserTable.addTask(tx, userId, payload);
              }
              tx.commit();
            } finally {
              // Always release the transaction, committed or not.
              tx.close();
            }
          }
        });
  }
  // This method only schedules the wavelet lookup; whether anything is
  // imported is decided later. The message used to be identical to the one in
  // scheduleImportTasks and wrongly reported that an import had been
  // scheduled, which made the two steps indistinguishable in the logs.
  log.info("Successfully scheduled wavelet lookup for " + results.size() + " waves");
}
/**
 * Enqueues one import task per wavelet. Tasks are added to the per-user
 * table five at a time, each group in its own transaction run through a
 * {@code RetryHelper}.
 *
 * @param results the wavelets to import
 * @param autoImportSettings the settings stored on every import task
 * @throws PermanentFailure if the retry helper reports a permanent failure
 */
private void scheduleImportTasks(List<RemoteConvWavelet> results,
    final ImportSettings autoImportSettings) throws PermanentFailure {
  // 5 tasks per transaction.
  Iterable<List<RemoteConvWavelet>> groups = Iterables.partition(results, 5);
  for (final List<RemoteConvWavelet> group : groups) {
    RetryHelper retrier = new RetryHelper();
    retrier.run(new RetryHelper.VoidBody() {
      @Override public void run() throws RetryableFailure, PermanentFailure {
        addImportTasksInOwnTransaction(group, autoImportSettings);
      }
    });
  }
  log.info("Successfully scheduled import of " + results.size() + " waves");
}

/**
 * Adds an import task for each wavelet of the group within one fresh
 * transaction, commits it, and always closes it.
 */
private void addImportTasksInOwnTransaction(List<RemoteConvWavelet> group,
    ImportSettings settings) throws RetryableFailure, PermanentFailure {
  CheckedTransaction tx = datastore.beginTransaction();
  try {
    for (RemoteConvWavelet remote : group) {
      perUserTable.addTask(tx, userId, makeImportWaveletPayload(remote, settings));
    }
    tx.commit();
  } finally {
    tx.close();
  }
}

/** Wraps an import task for a single wavelet in an import task payload. */
private ImportTaskPayload makeImportWaveletPayload(RemoteConvWavelet remote,
    ImportSettings settings) {
  ImportWaveletTask importTask = new ImportWaveletTaskGsonImpl();
  importTask.setInstance(remote.getSourceInstance().serialize());
  importTask.setWaveId(remote.getDigest().getWaveId());
  importTask.setWaveletId(remote.getWaveletId().serialise());
  importTask.setSettings(settings);
  ImportTaskPayload wrapped = new ImportTaskPayloadGsonImpl();
  wrapped.setImportWaveletTask(importTask);
  return wrapped;
}
/**
 * Lists the conversational wavelets of the wave described by {@code digest}.
 *
 * <p>Waves whose id does not begin with the wave prefix followed by "+" are
 * skipped and yield an empty list. Otherwise the wave view is fetched through
 * the robot API and every conversational wavelet id in it becomes one
 * {@code RemoteConvWavelet}; other wavelet ids are logged and ignored.
 *
 * @param instance the source instance hosting the wave
 * @param digest the search digest of the wave
 * @return the conversational wavelets found, possibly empty
 * @throws IOException if the robot API call fails
 */
private List<RemoteConvWavelet> expandPrivateReplies(SourceInstance instance,
    RobotSearchDigest digest) throws IOException {
  RobotApi robotApi = robotApiFactory.create(instance.getApiUrl());
  WaveId waveId = WaveId.deserialise(digest.getWaveId());
  // The robot API only allows access to waves with ids that start with "w".
  boolean reachable = waveId.getId().startsWith(IdUtil.WAVE_PREFIX + "+");
  if (!reachable) {
    log.info("Wave " + waveId + " not accessible through Robot API, skipping");
    return ImmutableList.of();
  }
  log.info("Getting wave view for " + waveId);
  List<WaveletId> viewIds = robotApi.getWaveView(waveId);
  log.info("Wave view for " + waveId + ": " + viewIds);
  ImmutableList.Builder<RemoteConvWavelet> found = ImmutableList.builder();
  for (WaveletId candidate : viewIds) {
    if (!IdUtil.isConversationalId(candidate)) {
      log.info("Skipping non-conv wavelet " + candidate);
      continue;
    }
    found.add(new RemoteConvWavelet(instance, digest, candidate, null, null));
  }
  return found.build();
}
/**
 * Processes a {@link FindWaveletsForRemoteWaveTask}: looks up the
 * conversational wavelets of the task's wave, stores them in the per-user
 * table and, if the task carries auto-import settings, schedules an import
 * task for each of them.
 *
 * @param task the task to process
 * @return always an empty list; this step returns no follow-up payloads
 * @throws IOException if the robot API call fails
 * @throws PermanentFailure if a datastore update fails permanently
 */
public List<ImportTaskPayload> findWavelets(FindWaveletsForRemoteWaveTask task)
    throws IOException, PermanentFailure {
  SourceInstance source = sourceInstanceFactory.parseUnchecked(task.getInstance());
  List<RemoteConvWavelet> found = expandPrivateReplies(source, task.getWaveDigest());
  // With no wavelets there is nothing to store or schedule.
  if (!found.isEmpty()) {
    storeResults(found);
    if (task.hasAutoImportSettings()) {
      scheduleImportTasks(found, task.getAutoImportSettings());
    }
  }
  return ImmutableList.of();
}
/**
 * Processes a {@link FindRemoteWavesTask}: searches the task's day interval,
 * schedules a find-wavelets task for every wave found and, when the result
 * list looks truncated, returns tasks that repeat the search on smaller
 * intervals.
 *
 * @param task the task to process
 * @return follow-up find-waves payloads for sub-intervals if the search
 *     returned at least {@code MAX_RESULTS} results, otherwise an empty list
 * @throws IOException if the robot API call fails
 * @throws PermanentFailure if a datastore update fails permanently
 * @throws RuntimeException if the result list looks truncated but the
 *     interval spans at most one day and so cannot be split
 */
public List<ImportTaskPayload> findWaves(FindRemoteWavesTask task)
    throws IOException, PermanentFailure {
  SourceInstance source = sourceInstanceFactory.parseUnchecked(task.getInstance());
  long onOrAfterDays = task.getOnOrAfterDays();
  long beforeDays = task.getBeforeDays();
  List<RobotSearchDigest> digests = searchBetween(source, onOrAfterDays, beforeDays);
  int found = digests.size();
  log.info("Search found " + found + " waves");
  if (found == 0) {
    return ImmutableList.of();
  }
  @Nullable ImportSettings autoImportSettings = null;
  if (task.hasAutoImportSettings()) {
    autoImportSettings = task.getAutoImportSettings();
  }
  // NOTE(ohler): Having many concurrent tasks like this that all need to write to
  // the PerUserTable leads to a lot of write contention. We'll just have to
  // keep max-concurrent-requests low.
  scheduleFindWaveletTasks(source, digests, autoImportSettings);
  if (found < MAX_RESULTS) {
    // The search stayed below the limit, so no re-search is scheduled.
    return ImmutableList.of();
  }
  // Result list is most likely truncated, repeat with smaller intervals.
  log.info("Got " + found + " results between " + onOrAfterDays + " and " + beforeDays
      + ", splitting");
  boolean splittable = beforeDays - onOrAfterDays > 1;
  if (!splittable) {
    throw new RuntimeException("Can't split further; too many results (" + found
        + ") between " + onOrAfterDays + " and " + beforeDays);
  }
  return makeRandomTasksForInterval(source, onOrAfterDays, beforeDays, autoImportSettings);
}
}