/*
* Syncany, www.syncany.org
* Copyright (C) 2011-2014 Philipp C. Heckel <philipp.heckel@gmail.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.syncany.chunk;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;
import org.syncany.chunk.Chunker.ChunkEnumeration;
import org.syncany.database.MultiChunkEntry.MultiChunkId;
/**
* The Deduper implements the core deduplication algorithm used by Syncany.
*
* <p>The algorithm uses a {@link Chunker} to break files into individual
* {@link Chunk}s. These chunks are added to a {@link MultiChunk} using an implementation
* of a {@link MultiChunker}. Before this multichunk is written to a file, it is transformed
* using one or many {@link Transformer}s (can be chained).
*
* <p>This class does not maintain a chunk index itself. Instead, it calls a listener to
* look up a chunk, and skips further chunk processing if the chunk already exists.
*
* <p>For a detailed description of the algorithm, please refer to chapter 5.3 of the thesis:
* <i>"Minimizing remote storage usage and synchronization time using deduplication and
* multichunking: Syncany as an example"</i>
*
* @see <a href="http://blog.philippheckel.com/2013/05/20/minimizing-remote-storage-usage-and-synchronization-time-using-deduplication-and-multichunking-syncany-as-an-example/">Blog post: Minimizing remote storage usage and synchronization time using deduplication and multichunking: Syncany as an example</a>
* @author Philipp C. Heckel <philipp.heckel@gmail.com>
*/
public class Deduper {
	private final Chunker chunker;
	private final MultiChunker multiChunker;
	private final Transformer transformer;

	/**
	 * Creates a deduper from the components it delegates to.
	 *
	 * @param chunker Breaks each file into chunks
	 * @param multiChunker Creates the multichunks that new chunks are written to
	 * @param transformer Wraps the output stream of every multichunk file
	 */
	public Deduper(Chunker chunker, MultiChunker multiChunker, Transformer transformer) {
		this.chunker = chunker;
		this.multiChunker = multiChunker;
		this.transformer = transformer;
	}

	/**
	 * Deduplicates the given list of files according to the Syncany chunk algorithm.
	 *
	 * <p>A brief description of the algorithm (and further links to a detailed description)
	 * are given in the {@link Deduper}.
	 *
	 * <p>For every file accepted by {@link DeduperListener#onFileFilter(File)}, the listener decides
	 * (via <tt>onFileStart</tt>) whether the contents are chunked. Each chunk is passed to
	 * <tt>onChunk</tt>; if that call returns <tt>true</tt>, the chunk is written to the currently
	 * open multichunk (a new one is opened when none is open or the current one is full). Every
	 * chunk, whether written or not, is reported through <tt>onFileAddChunk</tt>.
	 *
	 * <p>If this method terminates with an exception, the chunk enumeration of the current file
	 * and any multichunk that is still open are closed before the exception is propagated. In
	 * that case the listener is <i>not</i> notified about the closed multichunk.
	 *
	 * @param files List of files to be deduplicated
	 * @param listener Listener to react to file/chunk/multichunk events, and to implement the chunk index
	 * @throws IOException If a file cannot be read or an unexpected exception occurs
	 */
	public void deduplicate(List<File> files, DeduperListener listener) throws IOException {
		// Invariant: a non-null value is always an open multichunk. The reference is
		// cleared *before* close() is called, so the cleanup in the outer finally block
		// can never close the same multichunk twice.
		MultiChunk multiChunk = null;

		listener.onStart(files.size());

		try {
			for (int i = 0; i < files.size(); i++) {
				File file = files.get(i);

				// Filter ignored files
				if (!listener.onFileFilter(file)) {
					continue;
				}

				// Scoped per file: a file whose contents are not chunked (e.g. a folder)
				// must not inherit the checksum of the previously processed file.
				Chunk chunk = null;

				// Decide whether to index the contents
				if (listener.onFileStart(file, i)) {
					ChunkEnumeration chunksEnum = chunker.createChunks(file);

					try {
						while (chunksEnum.hasMoreElements()) {
							chunk = chunksEnum.nextElement();

							// Only chunks the listener reports as new are stored;
							// old chunks are merely referenced by the file below.
							if (listener.onChunk(chunk)) {
								// - Check if multichunk full
								if (multiChunk != null && multiChunk.isFull()) {
									MultiChunk fullMultiChunk = multiChunk;
									multiChunk = null;

									fullMultiChunk.close();
									listener.onMultiChunkClose(fullMultiChunk);
								}

								// - Open new multichunk if non-existent
								if (multiChunk == null) {
									multiChunk = openMultiChunk(chunk, listener);
									listener.onMultiChunkOpen(multiChunk);
								}

								// - Add chunk data
								multiChunk.write(chunk);
								listener.onMultiChunkWrite(multiChunk, chunk);
							}

							listener.onFileAddChunk(file, chunk);
						}
					}
					finally {
						// Closing file is necessary -- also if chunking was aborted by an exception!
						chunksEnum.close();
					}
				}

				// The file checksum is taken from the last chunk of the file (if any)
				if (chunk != null) {
					listener.onFileEnd(file, chunk.getFileChecksum());
				}
				else {
					listener.onFileEnd(file, null);
				}
			}

			// Close and add last multichunk
			if (multiChunk != null) {
				MultiChunk lastMultiChunk = multiChunk;
				multiChunk = null;

				lastMultiChunk.close();
				listener.onMultiChunkClose(lastMultiChunk);
			}

			listener.onFinish();
		}
		finally {
			// Only reached with an open multichunk if an exception is propagating
			if (multiChunk != null) {
				try {
					multiChunk.close();
				}
				catch (Exception ignored) {
					// Best-effort cleanup: a failure to close must not replace
					// the exception that aborted the deduplication.
				}
			}
		}
	}

	/**
	 * Creates a new multichunk for the given chunk, backed by the file the listener assigns to it.
	 *
	 * <p>If the transformer or the multichunker fails, the already opened file stream is closed
	 * before the exception is propagated.
	 */
	private MultiChunk openMultiChunk(Chunk chunk, DeduperListener listener) throws IOException {
		MultiChunkId newMultiChunkId = listener.createNewMultiChunkId(chunk);
		File multiChunkFile = listener.getMultiChunkFile(newMultiChunkId);

		FileOutputStream fileOutputStream = new FileOutputStream(multiChunkFile);
		boolean created = false;

		try {
			MultiChunk newMultiChunk = multiChunker.createMultiChunk(newMultiChunkId,
					transformer.createOutputStream(fileOutputStream));

			created = true;
			return newMultiChunk;
		}
		finally {
			if (!created) {
				try {
					fileOutputStream.close();
				}
				catch (IOException ignored) {
					// Best-effort cleanup: keep the original failure as the reported cause
				}
			}
		}
	}
}