Source Code of org.apache.blur.mapreduce.BlurReducer

package org.apache.blur.mapreduce;

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.blur.lucene.LuceneVersionConstant.LUCENE_VERSION;
import static org.apache.blur.utils.BlurConstants.RECORD_ID;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import org.apache.blur.analysis.FieldManager;
import org.apache.blur.log.Log;
import org.apache.blur.log.LogFactory;
import org.apache.blur.lucene.search.FairSimilarity;
import org.apache.blur.manager.writer.TransactionRecorder;
import org.apache.blur.mapreduce.BlurTask.INDEXING_TYPE;
import org.apache.blur.mapreduce.lib.BlurColumn;
import org.apache.blur.mapreduce.lib.BlurMutate;
import org.apache.blur.mapreduce.lib.BlurMutate.MUTATE_TYPE;
import org.apache.blur.mapreduce.lib.BlurOutputFormat;
import org.apache.blur.mapreduce.lib.BlurRecord;
import org.apache.blur.mapreduce.lib.DefaultBlurReducer;
import org.apache.blur.mapreduce.lib.ProgressableDirectory;
import org.apache.blur.server.TableContext;
import org.apache.blur.store.hdfs.HdfsDirectory;
import org.apache.blur.thrift.generated.Column;
import org.apache.blur.thrift.generated.Record;
import org.apache.blur.thrift.generated.Selector;
import org.apache.blur.thrift.generated.TableDescriptor;
import org.apache.blur.utils.BlurConstants;
import org.apache.blur.utils.BlurUtil;
import org.apache.blur.utils.ResetableDocumentStoredFieldVisitor;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TieredMergePolicy;
import org.apache.lucene.store.BufferedIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.NoLockFactory;
import org.apache.lucene.util.IOUtils;

/**
 * This class is deprecated; please use {@link BlurOutputFormat} in combination
 * with {@link DefaultBlurReducer}.
 */
@Deprecated
public class BlurReducer extends Reducer<Text, BlurMutate, Text, BlurMutate> {

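  /**
   * Orders index file names by on-disk length, largest first (ties broken by
   * name); used by the copy phase so the biggest files stream first.
   */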
  static class LuceneFileComparator implements Comparator<String> {

    private final Directory _directory;

    public LuceneFileComparator(Directory directory) {
      _directory = directory;
    }

    @Override
    public int compare(String o1, String o2) {
      try {
        long fileLength1 = _directory.fileLength(o1);
        long fileLength2 = _directory.fileLength(o2);
        if (fileLength1 == fileLength2) {
          return o1.compareTo(o2);
        }
        // Avoid the int overflow that (int) (fileLength2 - fileLength1) can
        // produce when the lengths differ by more than Integer.MAX_VALUE.
        return fileLength2 < fileLength1 ? -1 : 1;
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  }

  protected static final Log LOG = LogFactory.getLog(BlurReducer.class);
  protected static final long REPORT_PERIOD = TimeUnit.SECONDS.toMillis(10);
  protected static final double MB = 1024 * 1024;
  protected IndexWriter _writer;
  protected Directory _directory;
  protected Analyzer _analyzer;
  protected BlurTask _blurTask;

  protected Counter _recordCounter;
  protected Counter _rowCounter;
  protected Counter _fieldCounter;
  protected Counter _rowBreak;
  protected Counter _rowFailures;
  protected byte[] _copyBuf;
  protected Configuration _configuration;
  protected long _start;
  protected long _previousRow;
  protected long _previousRecord;
  protected long _prev;
  protected IndexReader _reader;
  protected Map<String, Document> _newDocs = new HashMap<String, Document>();
  protected Set<String> _recordIdsToDelete = new HashSet<String>();
  protected Term _rowIdTerm = new Term(BlurConstants.ROW_ID);
  private FieldManager _fieldManager;

  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    _blurTask = BlurTask.read(context.getConfiguration());
    _configuration = context.getConfiguration();
    setupCounters(context);
    setupAnalyzer(context);
    setupDirectory(context);
    setupWriter(context);
    if (_blurTask.getIndexingType() == INDEXING_TYPE.UPDATE) {
      _reader = DirectoryReader.open(_directory);
    }
  }

  protected void setupCounters(Context context) {
    _rowCounter = context.getCounter(BlurTask.getCounterGroupName(), BlurTask.getRowCounterName());
    _recordCounter = context.getCounter(BlurTask.getCounterGroupName(), BlurTask.getRecordCounterName());
    _fieldCounter = context.getCounter(BlurTask.getCounterGroupName(), BlurTask.getFieldCounterName());
    _rowBreak = context.getCounter(BlurTask.getCounterGroupName(), BlurTask.getRowBreakCounterName());
    _rowFailures = context.getCounter(BlurTask.getCounterGroupName(), BlurTask.getRowFailureCounterName());
    _start = System.currentTimeMillis();
    _prev = System.currentTimeMillis();
  }

  @Override
  protected void reduce(Text key, Iterable<BlurMutate> values, Context context) throws IOException,
      InterruptedException {
    if (!index(key, values, context)) {
      _rowFailures.increment(1);
    }
  }

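  /**
   * Indexes a single row: gathers the row's new documents and any record ids
   * to delete, tags the first document as the prime doc, then either adds the
   * row (REBUILD) or replaces it by row id term (UPDATE). Returns false when
   * the row exceeds the configured max records per row.
   */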
  protected boolean index(Text key, Iterable<BlurMutate> values, Context context) throws IOException {
    int recordCount = 0;
    _newDocs.clear();
    _recordIdsToDelete.clear();
    boolean rowIdSet = false;

    for (BlurMutate mutate : values) {
      BlurRecord record = mutate.getRecord();
      if (!rowIdSet) {
        String rowId = record.getRowId();
        _rowIdTerm = new Term(BlurConstants.ROW_ID, rowId);
        rowIdSet = true;
      }
      if (mutate.getMutateType() == MUTATE_TYPE.DELETE) {
        _recordIdsToDelete.add(record.getRecordId());
        continue;
      }
      Document document = toDocument(record);
      _newDocs.put(record.getRecordId(), document);

      context.progress();
      recordCount++;
      if (recordCount >= _blurTask.getMaxRecordsPerRow()) {
        return false;
      }
      if (_blurTask.getIndexingType() == INDEXING_TYPE.UPDATE) {
        fetchOldRecords();
      }
    }

    List<Document> docs = documentsToIndex(new ArrayList<Document>(_newDocs.values()));
    if (docs.size() > 0) {
      docs.get(0).add(new StringField(BlurConstants.PRIME_DOC, BlurConstants.PRIME_DOC_VALUE, Store.NO));
    }

    switch (_blurTask.getIndexingType()) {
    case REBUILD:
      _writer.addDocuments(docs);
      break;
    case UPDATE:
      _writer.updateDocuments(_rowIdTerm, docs);
      break;
    default:
      break;
    }

    _recordCounter.increment(recordCount);
    _rowCounter.increment(1);
    if (_prev + REPORT_PERIOD < System.currentTimeMillis()) {
      long records = _recordCounter.getValue();
      long rows = _rowCounter.getValue();

      long now = System.currentTimeMillis();

      double overAllSeconds = (now - _start) / 1000.0;
      double overAllRecordRate = records / overAllSeconds;
      double overAllRowsRate = rows / overAllSeconds;

      double seconds = (now - _prev) / 1000.0;
      double recordRate = (records - _previousRecord) / seconds;
      double rowsRate = (rows - _previousRow) / seconds;

      String status = String.format(
          "Totals [%d Row, %d Records], Avg Rates [%.1f Row/s, %.1f Records/s] Rates [%.1f Row/s, %.1f Records/s]",
          rows, records, overAllRowsRate, overAllRecordRate, rowsRate, recordRate);

      LOG.info(status);
      context.setStatus(status);

      _previousRecord = records;
      _previousRow = rows;
      _prev = now;
    }
    return true;
  }

  protected List<Document> documentsToIndex(List<Document> list) {
    return list;
  }

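  /**
   * For UPDATE runs: merges the row's existing documents into the pending map
   * (freshly written records win) and then drops any records marked deleted.
   */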
  protected void fetchOldRecords() throws IOException {
    List<Document> docs = BlurUtil.fetchDocuments(_reader, _rowIdTerm, new ResetableDocumentStoredFieldVisitor(),
        new Selector(), Integer.MAX_VALUE, "reducer-context");
    for (Document document : docs) {
      String recordId = document.get(RECORD_ID);
      // add them to the new records if the new records do not contain them.
      if (!_newDocs.containsKey(recordId)) {
        _newDocs.put(recordId, document);
      }
    }

    // delete all records that should be removed.
    for (String recordId : _recordIdsToDelete) {
      _newDocs.remove(recordId);
    }
  }

  @Override
  protected void cleanup(Context context) throws IOException, InterruptedException {
    switch (_blurTask.getIndexingType()) {
    case UPDATE:
      cleanupFromUpdate(context);
      return;
    case REBUILD:
      cleanupFromRebuild(context);
      return;
    default:
      break;
    }
  }

  protected void cleanupFromUpdate(Context context) throws IOException {
    _writer.commit();
    _writer.close();
  }

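  /**
   * After a REBUILD, publishes the locally built index to the destination
   * directory, either merged through a fresh IndexWriter (optimize) or as a
   * raw file copy, largest files first.
   */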
  protected void cleanupFromRebuild(Context context) throws IOException, InterruptedException {
    _writer.commit();
    _writer.close();

    IndexReader reader = DirectoryReader.open(_directory);

    TableDescriptor descriptor = _blurTask.getTableDescriptor();

    Path directoryPath = _blurTask.getDirectoryPath(context);
    remove(directoryPath);

    NoLockFactory lockFactory = NoLockFactory.getNoLockFactory();

    Directory destDirectory = getDestDirectory(context.getConfiguration(), descriptor, directoryPath);
    destDirectory.setLockFactory(lockFactory);

    boolean optimize = _blurTask.getOptimize();

    if (optimize) {
      context.setStatus("Starting Copy-Optimize Phase");
      IndexWriterConfig conf = new IndexWriterConfig(LUCENE_VERSION, _analyzer);
      TieredMergePolicy policy = (TieredMergePolicy) conf.getMergePolicy();
      policy.setUseCompoundFile(false);
      long s = System.currentTimeMillis();
      IndexWriter writer = new IndexWriter(getBiggerBuffers(destDirectory), conf);
      writer.addIndexes(reader);
      writer.close();
      long e = System.currentTimeMillis();
      context.setStatus("Copying phase took [" + (e - s) + " ms]");
      LOG.info("Copying phase took [" + (e - s) + " ms]");
    } else {
      context.setStatus("Starting Copy-Optimize Phase");
      long s = System.currentTimeMillis();
      List<String> files = getFilesOrderedBySize(_directory);
      long totalBytesToCopy = getTotalBytes(_directory);
      long totalBytesCopied = 0;
      long startTime = System.currentTimeMillis();
      for (String file : files) {
        totalBytesCopied += copy(_directory, destDirectory, file, file, context, totalBytesCopied, totalBytesToCopy,
            startTime);
      }
      long e = System.currentTimeMillis();
      context.setStatus("Copying phase took [" + (e - s) + " ms]");
      LOG.info("Copying phase took [" + (e - s) + " ms]");
    }
    reader.close();
  }

  protected Directory getBiggerBuffers(Directory destDirectory) {
    return new BufferedDirectory(destDirectory, 32768);
  }

  protected Directory getDestDirectory(Configuration configuration, TableDescriptor descriptor, Path directoryPath)
      throws IOException {
    return new HdfsDirectory(configuration, directoryPath);
  }

  protected CompressionCodec getInstance(String compressionClass) throws IOException {
    try {
      CompressionCodec codec = (CompressionCodec) Class.forName(compressionClass).newInstance();
      if (codec instanceof Configurable) {
        Configurable configurable = (Configurable) codec;
        configurable.setConf(_configuration);
      }
      return codec;
    } catch (Exception e) {
      throw new IOException(e);
    }
  }

  protected void remove(Path directoryPath) throws IOException {
    FileSystem fileSystem = FileSystem.get(directoryPath.toUri(), _configuration);
    fileSystem.delete(directoryPath, true);
  }

  protected long getTotalBytes(Directory directory) throws IOException {
    long total = 0;
    for (String file : directory.listAll()) {
      total += directory.fileLength(file);
    }
    return total;
  }

  protected long copy(Directory from, Directory to, String src, String dest, Context context, long totalBytesCopied,
      long totalBytesToCopy, long startTime) throws IOException {
    IndexOutput os = to.createOutput(dest, new IOContext());
    IndexInput is = from.openInput(src, new IOContext());
    IOException priorException = null;
    try {
      return copyBytes(is, os, is.length(), context, totalBytesCopied, totalBytesToCopy, startTime, src);
    } catch (IOException ioe) {
      priorException = ioe;
    } finally {
      IOUtils.closeWhileHandlingException(priorException, os, is);
    }
    return 0; // unreachable: closeWhileHandlingException rethrows priorException
  }

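  /**
   * Streams numBytes from in to out through a reusable buffer, reporting copy
   * progress roughly every REPORT_PERIOD and ticking the task so long copies
   * are not killed as unresponsive.
   */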
  protected long copyBytes(IndexInput in, IndexOutput out, long numBytes, Context context, long totalBytesCopied,
      long totalBytesToCopy, long startTime, String src) throws IOException {
    if (_copyBuf == null) {
      _copyBuf = new byte[BufferedIndexInput.BUFFER_SIZE];
    }
    long start = System.currentTimeMillis();
    long copied = 0;
    while (numBytes > 0) {
      if (start + REPORT_PERIOD < System.currentTimeMillis()) {
        report(context, totalBytesCopied + copied, totalBytesToCopy, startTime, src);
        start = System.currentTimeMillis();
      }
      final int toCopy = (int) (numBytes > _copyBuf.length ? _copyBuf.length : numBytes);
      in.readBytes(_copyBuf, 0, toCopy);
      out.writeBytes(_copyBuf, 0, toCopy);
      numBytes -= toCopy;
      copied += toCopy;
      context.progress();
    }
    return copied;
  }

  protected List<String> getFilesOrderedBySize(final Directory directory) throws IOException {
    List<String> files = new ArrayList<String>(Arrays.asList(directory.listAll()));
    Collections.sort(files, new LuceneFileComparator(_directory));
    return files;
  }

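  /**
   * UPDATE indexes straight into the destination HdfsDirectory (locking
   * disabled); REBUILD indexes into a local temp directory that is later
   * copied out by cleanupFromRebuild(Context).
   */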
  protected void setupDirectory(Context context) throws IOException {
    TableDescriptor descriptor = _blurTask.getTableDescriptor();
    switch (_blurTask.getIndexingType()) {
    case UPDATE:
      Path directoryPath = _blurTask.getDirectoryPath(context);
      _directory = getDestDirectory(context.getConfiguration(), descriptor, directoryPath);

      NoLockFactory lockFactory = NoLockFactory.getNoLockFactory();
      _directory.setLockFactory(lockFactory);
      return;
    case REBUILD:
      File dir = new File(System.getProperty("java.io.tmpdir"));
      File path = new File(dir, "index");
      rm(path);
      LOG.info("Using local path [" + path + "] for indexing.");

      Directory localDirectory = FSDirectory.open(path);
      _directory = new ProgressableDirectory(localDirectory, context);
      return;
    default:
      break;
    }
  }

  protected String getNodeName(Context context) {
    return context.getTaskAttemptID().toString();
  }

  protected void rm(File path) {
    if (!path.exists()) {
      return;
    }
    if (path.isDirectory()) {
      for (File f : path.listFiles()) {
        rm(f);
      }
    }
    path.delete();
  }

  protected <T> T nullCheck(T o) {
    if (o == null) {
      throw new NullPointerException();
    }
    return o;
  }

  protected void setupWriter(Context context) throws IOException {
    nullCheck(_directory);
    nullCheck(_analyzer);
    IndexWriterConfig config = new IndexWriterConfig(LUCENE_VERSION, _analyzer);
    config.setSimilarity(new FairSimilarity());
    config.setRAMBufferSizeMB(_blurTask.getRamBufferSizeMB());
    TieredMergePolicy mergePolicy = (TieredMergePolicy) config.getMergePolicy();
    mergePolicy.setUseCompoundFile(false);
    _writer = new IndexWriter(_directory, config);
  }

  protected void setupAnalyzer(Context context) {
    TableContext tableContext = TableContext.create(_blurTask.getTableDescriptor());
    _fieldManager = tableContext.getFieldManager();
    _analyzer = _fieldManager.getAnalyzerForIndex();
  }

  protected Document toDocument(BlurRecord record) throws IOException {
    Document document = new Document();
    document.add(new Field(BlurConstants.ROW_ID, record.getRowId(), TransactionRecorder.ID_TYPE));
    document.add(new Field(BlurConstants.RECORD_ID, record.getRecordId(), TransactionRecorder.ID_TYPE));

    List<Field> doc = TransactionRecorder.getDoc(_fieldManager, record.getRowId(), toRecord(record));
    for (Field field : doc) {
      document.add(field);
    }
    return document;
  }

  private Record toRecord(BlurRecord record) {
    Record r = new Record();
    r.setFamily(record.getFamily());
    r.setRecordId(record.getRecordId());
    List<BlurColumn> columns = record.getColumns();
    for (BlurColumn blurColumn : columns) {
      r.addToColumns(new Column(blurColumn.getName(), blurColumn.getValue()));
    }
    return r;
  }

  protected static void report(Context context, long totalBytesCopied, long totalBytesToCopy, long startTime, String src) {
    long now = System.currentTimeMillis();
    double seconds = (now - startTime) / 1000.0;
    double rate = totalBytesCopied / seconds;
    String time = estimateTimeToComplete(rate, totalBytesCopied, totalBytesToCopy);

    String status = String
        .format(
            "%.1f%% Complete - Time Remaining [%s], Copy rate [%.1f MB/s], Total Copied [%.1f MB], Total To Copy [%.1f MB]",
            getPerComplete(totalBytesCopied, totalBytesToCopy), time, getMb(rate), getMb(totalBytesCopied),
            getMb(totalBytesToCopy));
    LOG.info(status);
    context.setStatus(status);
  }

  protected static double getPerComplete(long totalBytesCopied, long totalBytesToCopy) {
    return ((double) totalBytesCopied / (double) totalBytesToCopy) * 100.0;
  }

  protected static double getMb(double b) {
    return b / MB;
  }

  protected static String estimateTimeToComplete(double rate, long totalBytesCopied, long totalBytesToCopy) {
    long whatsLeft = totalBytesToCopy - totalBytesCopied;
    long secondsLeft = (long) (whatsLeft / rate);
    return BlurUtil.humanizeTime(secondsLeft, TimeUnit.SECONDS);
  }
}
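
As the class Javadoc notes, new jobs should use BlurOutputFormat with DefaultBlurReducer instead. Below is a minimal job-setup sketch, assuming the org.apache.blur.mapreduce.lib classes behave as in Blur 0.2.x; the mapper here is hypothetical and stands in for whatever parses your input into BlurMutate objects:

import java.io.IOException;

import org.apache.blur.mapreduce.lib.BlurMutate;
import org.apache.blur.mapreduce.lib.BlurOutputFormat;
import org.apache.blur.mapreduce.lib.DefaultBlurReducer;
import org.apache.blur.thrift.generated.TableDescriptor;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;

public class BlurIndexJobSketch {

  /** Hypothetical mapper: a real job would emit one (row id, BlurMutate) pair per record. */
  public static class MyMapper extends Mapper<LongWritable, Text, Text, BlurMutate> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException,
        InterruptedException {
      // Parse value into a BlurMutate and write it keyed by its row id, e.g.:
      // context.write(new Text(rowId), mutate);
    }
  }

  public static Job createJob(Configuration conf, TableDescriptor tableDescriptor) throws IOException {
    Job job = new Job(conf, "blur-indexing"); // Job.getInstance(conf, ...) on newer Hadoop
    job.setJarByClass(BlurIndexJobSketch.class);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BlurMutate.class);
    job.setReducerClass(DefaultBlurReducer.class); // harmless if setupJob already sets it
    // setupJob is expected to wire BlurOutputFormat in as the output format and
    // record the table descriptor (and reducer/shard settings) on the job.
    BlurOutputFormat.setupJob(job, tableDescriptor);
    return job;
  }
}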