Package lupos.engine.indexconstruction

Source Code of lupos.engine.indexconstruction.FastRDF3XIndexConstruction

/**
* Copyright (c) 2013, Institute of Information Systems (Sven Groppe and contributors of LUPOSDATE), University of Luebeck
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
* following conditions are met:
*
*   - Redistributions of source code must retain the above copyright notice, this list of conditions and the following
*     disclaimer.
*   - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the
*     following disclaimer in the documentation and/or other materials provided with the distribution.
*   - Neither the name of the University of Luebeck nor the names of its contributors may be used to endorse or promote
*     products derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

package lupos.engine.indexconstruction;

import lupos.compression.Compression;
import lupos.datastructures.buffermanager.BufferManager;
import lupos.datastructures.dbmergesortedds.DBMergeSortedBag;
import lupos.datastructures.dbmergesortedds.DiskCollection;
import lupos.datastructures.dbmergesortedds.MapEntry;
import lupos.datastructures.dbmergesortedds.SortConfiguration;
import lupos.datastructures.dbmergesortedds.heap.Heap;
import lupos.datastructures.dbmergesortedds.heap.Heap.HEAPTYPE;
import lupos.datastructures.items.Triple;
import lupos.datastructures.items.TripleKey;
import lupos.datastructures.items.literal.LazyLiteral;
import lupos.datastructures.items.literal.Literal;
import lupos.datastructures.items.literal.LiteralFactory;
import lupos.datastructures.items.literal.URILiteral;
import lupos.datastructures.items.literal.codemap.StringIntegerMapJava;
import lupos.datastructures.paged_dbbptree.DBBPTree.Generator;
import lupos.datastructures.paged_dbbptree.node.nodedeserializer.StringIntegerNodeDeSerializer;
import lupos.datastructures.patriciatrie.TrieSet;
import lupos.datastructures.patriciatrie.diskseq.DBSeqTrieSet;
import lupos.datastructures.patriciatrie.exception.TrieNotCopyableException;
import lupos.datastructures.patriciatrie.ram.RBTrieMap;
import lupos.datastructures.queryresult.SIPParallelIterator;
import lupos.datastructures.stringarray.StringArray;
import lupos.engine.evaluators.CommonCoreQueryEvaluator;
import lupos.engine.evaluators.RDF3XQueryEvaluator;
import lupos.engine.operators.index.Indices;
import lupos.engine.operators.index.Indices.DATA_STRUCT;
import lupos.engine.operators.index.adaptedRDF3X.RDF3XIndexScan.CollationOrder;
import lupos.engine.operators.index.adaptedRDF3X.SixIndices;
import lupos.engine.operators.tripleoperator.TripleConsumer;
import lupos.io.helper.InputHelper;
import lupos.io.helper.OutHelper;
import lupos.misc.FileHelper;
import lupos.misc.TimeInterval;
import lupos.misc.util.ImmutableIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

/**
* This class constructs the RDF3X indices on disk using a dictionary, which is
* also constructed on disk...
*/
public class FastRDF3XIndexConstruction {

  private static final Logger log = LoggerFactory.getLogger(FastRDF3XIndexConstruction.class);

  // the parameters are used for the B+-trees
  private static final int k = 1000;
  private static final int k_ = 1000;

  private static final int NUMBER_OF_THREADS = 8;

  // maximum number of triples loaded into main memory and sorted for each initial run
  public static int LIMIT_TRIPLES_IN_MEMORY = 50000000;

  // just for mapping the positions 0 to 2 to S, P and O
  protected final static String[] map = new String[]{"S", "P", "O"};
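
  /*
   * Overview of the construction phases as implemented in main() below:
   * 1. read the input files and build, block-wise, a local dictionary (patricia trie) as well as six sorted
   *    initial runs of id-triples with local ids (CreateLocalDictionaryAndLocalIds, CountingSorter, SecondaryConditionSorter)
   * 2. merge all local tries into one final trie and generate the global dictionary (B+-tree and string array) from it
   * 3. map the local ids of the initial runs to the global ids (LocalToGlobalIdMapper)
   * 4. merge the initial runs of each collation order into one final run, eliminating duplicates (mergeRuns)
   * 5. generate the six evaluation indices plus histogram indices from the final runs (GeneratorFromFinalRun) and write out the index info file
   */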


  /**
   * Constructs the large-scale indices for RDF3X.
   * The command line arguments are
   * <datafile> <dataformat> <encoding> <NONE|BZIP2|HUFFMAN|GZIP> <directory for indices> [LIMIT_TRIPLES_IN_MEMORY [<datafile2> [<datafile3> ...]]]
   * If you want to import more than one file, you can pass the additional parameters <datafile2>, <datafile3>, ...
   *
   * @param args
   *            command line arguments
   */
  public static void main(final String[] args) {
    try {

      log.info("Starting program to construct an RDF3X Index for LUPOSDATE...");
      log.debug("[help is printed when using less than 5 command line arguments]");
      log.debug("_______________________________________________________________");

      if (args.length < 5) {
        log.error("Usage: java -Xmx768M lupos.engine.indexconstruction.FastRDF3XIndexConstruction <datafile> <dataformat> <encoding> <NONE|BZIP2|HUFFMAN|GZIP> <directory for indices> [LIMIT_TRIPLES_IN_MEMORY [<datafile2> [<datafile3> ...]]]");
        log.error("Example: java -Xmx768M lupos.engine.indexconstruction.FastRDF3XIndexConstruction data.n3 N3 UTF-8 NONE /luposdateindex 500000");
        return;
      }

      // analyze command line parameters
      final Date start = new Date();
      log.debug("Starting time: {}", start);

      LiteralFactory.setType(LiteralFactory.MapType.NOCODEMAP);
      Indices.setUsedDatastructure(DATA_STRUCT.DBBPTREE);

      final String datafile = args[0];
      final String dataFormat = args[1];
      CommonCoreQueryEvaluator.encoding = args[2];

      final String compressor = args[3];
      if(compressor.compareTo("BZIP2")==0){
        SortConfiguration.setDEFAULT_COMPRESSION(Compression.BZIP2);
      } else if(compressor.compareTo("HUFFMAN")==0){
        SortConfiguration.setDEFAULT_COMPRESSION(Compression.HUFFMAN);
      } else if(compressor.compareTo("GZIP")==0){
        SortConfiguration.setDEFAULT_COMPRESSION(Compression.GZIP);
      } else {
        SortConfiguration.setDEFAULT_COMPRESSION(Compression.NONE);
      }

      String dir = args[4];
      if(!dir.endsWith("\\") && !dir.endsWith("/")) {
        dir += "/";
      }
      // create the directory so that the indices can be stored inside it
      final File f_dir = new File(dir);
      f_dir.mkdirs();

      final String[] dirArray = new String[] { dir };
      final String writeindexinfo = dirArray[0]+File.separator+RDF3XQueryEvaluator.INDICESINFOFILE;
      DBMergeSortedBag.setTmpDir(dirArray);
      DiskCollection.setTmpDir(dirArray);
      lupos.datastructures.paged_dbbptree.DBBPTree.setTmpDir(dir, true);

      final Collection<URILiteral> defaultGraphs = new LinkedList<URILiteral>();
      defaultGraphs.add(LiteralFactory.createURILiteralWithoutLazyLiteral("<file:" + datafile+ ">"));

      if(args.length>5) {
        FastRDF3XIndexConstruction.LIMIT_TRIPLES_IN_MEMORY = Integer.parseInt(args[5]);
      }
      for(int i=6; i<args.length; i++) {
        defaultGraphs.add(LiteralFactory.createURILiteralWithoutLazyLiteral("<file:" + args[i]+ ">"));
      }

      // first generate a local dictionary for each block as well as id-triples using the local ids of that dictionary
      final CreateLocalDictionaryAndLocalIds runGenerator = new CreateLocalDictionaryAndLocalIds(dir);
      for(final URILiteral uri: defaultGraphs) {
        try {
          CommonCoreQueryEvaluator.readTriples(dataFormat, uri.openStream(), runGenerator);
        } catch (final Exception e) {
          log.error(e.getMessage(), e);
        }
      }
      runGenerator.endOfBlock();

      // merge local dictionaries
      final List<TrieSet> listOfTries = runGenerator.getTries();
      final TrieSet final_trie = new DBSeqTrieSet(dir + "FinalTrie");
      if(listOfTries.size()>1){
        final_trie.merge(listOfTries);
      } else {
        final_trie.copy(listOfTries.get(0));
      }

      // create real dictionary
      final Generator<String, Integer> smsi = new Generator<String, Integer>() {

        @Override
        public Iterator<java.util.Map.Entry<String, Integer>> iterator() {
          return new ImmutableIterator<java.util.Map.Entry<String, Integer>>() {

            Iterator<String> it = final_trie.iterator();
            int index = 1;

            @Override
            public boolean hasNext() {
              return this.it.hasNext();
            }

            @Override
            public Entry<String, Integer> next() {
              return new MapEntry<String, Integer>(this.it.next(), this.index++);
            }
          };
        }

        @Override
        public int size() {
          return final_trie.size();
        }
      };
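      // the generator above enumerates the distinct strings of the merged trie in lexicographic order and
      // assigns them the global ids 1, 2, ... (note that the local ids of the blocks start at 0, the global ids at 1)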

      final lupos.datastructures.paged_dbbptree.DBBPTree<String, Integer> simap = new lupos.datastructures.paged_dbbptree.DBBPTree<String, Integer>(
            k,
            k_,
            new StringIntegerNodeDeSerializer());

      final Thread thread0 = new Thread() {
        @Override
        public void run() {

          try {
            simap.generateDBBPTree(smsi);
            LazyLiteral.setHm(new StringIntegerMapJava(simap));
          } catch (final IOException e) {
            log.error(e.getMessage(), e);
          }
        }
      };
      final Thread thread1 = new Thread() {
        @Override
        public void run() {
          StringArray ismap;
          try {
            ismap = new StringArray();
            ismap.generate(final_trie.iterator());
            LazyLiteral.setV(ismap);
          } catch (final IOException e) {
            log.error(e.getMessage(), e);
          }
        }
      };
      // TODO make thread-safe!
      LiteralFactory.setTypeWithoutInitializing(LiteralFactory.MapType.LAZYLITERALWITHOUTINITIALPREFIXCODEMAP);
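      // note: run() instead of start() is called here, i.e., both generation tasks are executed sequentially
      // on the current thread (see the TODO above); the join() calls below therefore return immediately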
      thread0.run();
      thread1.run();
      try {
        thread0.join();
        thread1.join();
      } catch (final InterruptedException e) {
        log.error(e.getMessage(), e);
      }
      final_trie.release();

      // map the local ids of the initial runs to the global ids
      int runNumber = 0;
      for(final TrieSet trie: listOfTries) {
        // determine mapping
        final int[] mapping = new int[trie.size()];
        final SIPParallelIterator<java.util.Map.Entry<String, Integer>, String> iterator = (SIPParallelIterator<java.util.Map.Entry<String, Integer>, String>) simap.entrySet().iterator();

        int local_id =0;
        for(final String key: trie){
          final java.util.Map.Entry<String, Integer> entry = iterator.next(key);
          if(entry.getKey().compareTo(key)!=0){
            log.error("Local string not in global dictionary! Cannot be without any other error => Abort!");
            System.exit(0);
          }
          mapping[local_id] = entry.getValue();
          local_id++;
        }

        iterator.close();

        // local trie is not needed any more!
        trie.release();
        // map all 6 different initial runs of this trie
        final Thread[] threads = new Thread[6];
        for(int primaryPos=0; primaryPos<3; primaryPos++) {
          final int other_condition1 = (primaryPos==0)?1:0;
          final int other_condition2 = (primaryPos==2)?1:2;
          final String prefixFilename = dir + FastRDF3XIndexConstruction.map[primaryPos] + "_Run_"+runNumber+"_";
          threads[primaryPos*2] = new LocalToGlobalIdMapper(prefixFilename + FastRDF3XIndexConstruction.map[other_condition1] + FastRDF3XIndexConstruction.map[other_condition2], mapping);
          threads[primaryPos*2].start();
          threads[primaryPos*2 + 1] = new LocalToGlobalIdMapper(prefixFilename + FastRDF3XIndexConstruction.map[other_condition2] + FastRDF3XIndexConstruction.map[other_condition1], mapping);
          threads[primaryPos*2 + 1].start();
        }

        // wait for the six threads to finish their job (otherwise memory consumption might become too high)
        for(final Thread thread: threads){
          thread.join();
        }

        runNumber++;
      }

      // merge initial runs...
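      // each call to mergeRuns returns the number of distinct triples of the generated final run; this number
      // is the same for all six collation orders, such that it is sufficient to keep the last returned value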
      int size = 0;
      for(int primaryPos=0; primaryPos<3; primaryPos++) {
        final int other_condition1 = (primaryPos==0)?1:0;
        final int other_condition2 = (primaryPos==2)?1:2;
        size = FastRDF3XIndexConstruction.mergeRuns(dir, listOfTries.size(), primaryPos, other_condition1, other_condition2);
        size = FastRDF3XIndexConstruction.mergeRuns(dir, listOfTries.size(), primaryPos, other_condition2, other_condition1);
      }

      // generate indices (evaluation indices plus histogram indices)
      final SixIndices indices = new SixIndices(defaultGraphs.iterator().next());

      for(int primaryPos=0; primaryPos<3; primaryPos++) {
        final int other_condition1 = (primaryPos==0)?1:0;
        final int other_condition2 = (primaryPos==2)?1:2;
        final CollationOrder order1 = CollationOrder.valueOf(FastRDF3XIndexConstruction.map[primaryPos] + FastRDF3XIndexConstruction.map[other_condition1] + FastRDF3XIndexConstruction.map[other_condition2]);
        final String prefixFilename = dir + FastRDF3XIndexConstruction.map[primaryPos] + "_Final_Run_";
        indices.generate(order1, new GeneratorFromFinalRun(prefixFilename + FastRDF3XIndexConstruction.map[other_condition1] + FastRDF3XIndexConstruction.map[other_condition2], size, primaryPos, other_condition1, other_condition2));
        indices.generateStatistics(order1);

        final CollationOrder order2 = CollationOrder.valueOf(FastRDF3XIndexConstruction.map[primaryPos] + FastRDF3XIndexConstruction.map[other_condition2] + FastRDF3XIndexConstruction.map[other_condition1]);
        indices.generate(order2, new GeneratorFromFinalRun(prefixFilename + FastRDF3XIndexConstruction.map[other_condition2] + FastRDF3XIndexConstruction.map[other_condition1], size, primaryPos, other_condition2, other_condition1));
        indices.generateStatistics(order2);
      }

      indices.constructCompletely();

      // write out index info

      final OutputStream out = new BufferedOutputStream(new FileOutputStream(writeindexinfo));

      BufferManager.getBufferManager().writeAllModifiedPages();

      OutHelper.writeLuposInt(lupos.datastructures.paged_dbbptree.DBBPTree.getCurrentFileID(), out);

      ((lupos.datastructures.paged_dbbptree.DBBPTree) ((StringIntegerMapJava) LazyLiteral.getHm()).getOriginalMap()).writeLuposObject(out);
      ((StringArray) LazyLiteral.getV()).writeLuposStringArray(out);
      OutHelper.writeLuposInt(1, out);
      LiteralFactory.writeLuposLiteral(defaultGraphs.iterator().next(), out);
      indices.writeIndexInfo(out);
      OutHelper.writeLuposInt(0, out);
      out.close();
      final Date end = new Date();
      log.debug("_______________________________________________________________");
      log.info("Done, RDF3X index constructed!");
      log.debug("End time: {}", end);

      log.debug("Used time: {}", new TimeInterval(start, end));
      log.debug("Number of imported triples: {}", indices.getIndex(CollationOrder.SPO).size());
    } catch (final Exception e) {
      log.error(e.getMessage(), e);
    }
  }

  /**
   * Merges the initial runs into a final run...
   *
   * @param prefixFilename the common prefix (directory) of the run files
   * @param numberOfRuns the number of initial runs to be merged
   * @param primaryPos the primary sort position (0 = S, 1 = P, 2 = O)
   * @param secondaryPos the secondary sort position
   * @param tertiaryPos the tertiary sort position
   * @return the number of distinct triples written to the final run
   * @throws IOException if reading the initial runs or writing the final run fails
   */
  public static int mergeRuns(final String prefixFilename, final int numberOfRuns, final int primaryPos, final int secondaryPos, final int tertiaryPos) throws IOException {
    int size = 0;
    // use a heap for merging the smallest elements of the runs
    final Heap<HeapElementContainer> mergeHeap = Heap.createInstance(numberOfRuns, true, HEAPTYPE.OPTIMIZEDSEQUENTIAL);
    final IteratorFromRun[] iterators = new IteratorFromRun[numberOfRuns];
    for(int i=0; i<numberOfRuns; i++) {
      iterators[i] = new IteratorFromRun(prefixFilename + FastRDF3XIndexConstruction.map[primaryPos] + "_Run_"+i+"_"+ FastRDF3XIndexConstruction.map[secondaryPos] + FastRDF3XIndexConstruction.map[tertiaryPos] + "_mapped");
      final int[] first = iterators[i].next();
      if(first!=null){
        mergeHeap.add(new HeapElementContainer(first, i));
      }
    }

    final FinalRunWriter out = new FinalRunWriter(prefixFilename + FastRDF3XIndexConstruction.map[primaryPos] + "_Final_Run_"+ FastRDF3XIndexConstruction.map[secondaryPos] + FastRDF3XIndexConstruction.map[tertiaryPos]);
    int[] lastTriple = null;
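    // k-way merge: repeatedly pop the smallest triple over all runs from the heap, refill the heap from the
    // run the popped triple came from, and write each distinct triple only once into the final run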
    while(!mergeHeap.isEmpty()) {
      final HeapElementContainer element = mergeHeap.pop();
      final int run = element.getRun();
      final int[] toBeAdded = iterators[run].next();
      if(toBeAdded!=null){
        mergeHeap.add(new HeapElementContainer(toBeAdded, run));
      }
      final int[] triple = element.getTriple();
      if(lastTriple == null || lastTriple[0]!=triple[0] || lastTriple[1]!=triple[1] || lastTriple[2]!=triple[2]) {
        // store triple in final run
        out.write(triple);
        lastTriple = triple;
        size++;
      }
    }
    out.close();

    // delete all intermediate runs
    for(int i=0; i<numberOfRuns; i++) {
      FileHelper.deleteFile(prefixFilename + FastRDF3XIndexConstruction.map[primaryPos] + "_Run_"+i+"_"+ FastRDF3XIndexConstruction.map[secondaryPos] + FastRDF3XIndexConstruction.map[tertiaryPos] + "_mapped");
    }

    return size;
  }

  /**
   * Creates a 'local' dictionary for each initial run as well as id-triples of the initial runs according to that local dictionary
   */
  public static class CreateLocalDictionaryAndLocalIds implements TripleConsumer{

    private final RBTrieMap<Integer> map = new RBTrieMap<Integer>();
    private final int[][] blockOfIdTriples = new int[FastRDF3XIndexConstruction.LIMIT_TRIPLES_IN_MEMORY][];
    private int index = 0;
    private int runNumber = 0;
    private final List<TrieSet> listOfTries = new LinkedList<TrieSet>();
    private final String dir;

    public CreateLocalDictionaryAndLocalIds(final String dir){
      this.dir = dir;
    }

    @Override
    public void consume(final Triple triple) {
      final int[] idtriple = new int[3];
      int i=0;
      for(final Literal literal: triple){
        idtriple[i] = this.insertIntoMap(literal.toString());
        if(literal.originalStringDiffers()){
          this.insertIntoMap(literal.originalString());
        }
        i++;
      }
      this.blockOfIdTriples[this.index] = idtriple;
      this.index++;
      if(this.index>=FastRDF3XIndexConstruction.LIMIT_TRIPLES_IN_MEMORY){
        this.endOfBlock();
      }
    }

    private int insertIntoMap(final String value) {
      Integer code = this.map.get(value);
      if(code==null) {
        code = this.map.size();
        this.map.put(value, code);
      }
      return code;
    }

    public void endOfBlock(){
      if(this.index==0){
        return;
      }
      // create a mapping from the preliminary ids (assigned in insertion order) to local ids, which reflect the lexicographic order of the dictionary entries
      final int[] mapping = new int[this.map.size()];
      int local_id = 0;
      for(final Entry<String, Integer> entry: this.map) {
        mapping[entry.getValue()] = local_id;
        local_id++;
      }
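      // example (assuming lexicographic iteration of the trie map): if "z" was inserted first (preliminary id 0)
      // and "a" afterwards (preliminary id 1), the iteration yields "a", "z" and thus mapping[1] = 0 and mapping[0] = 1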
      // apply mapping to id triples
      for(int i=0; i<this.index; i++) {
        final int[] triple = this.blockOfIdTriples[i];
        for(int j=0; j<3; j++) {
          triple[j] = mapping[triple[j]];
        }
      }
      // write out patricia trie
      final DBSeqTrieSet disk_set = new DBSeqTrieSet(this.dir+"Set_"+this.runNumber);
      try {
        disk_set.copy(this.map);
        this.listOfTries.add(disk_set);
      } catch (final TrieNotCopyableException e) {
        log.error(e.getMessage(), e);
      }
      // free resources of map in main memory
      this.map.clear();

      // sort id triples according to six collation orders and write them out as runs (in parallel)...
      final Thread threadS = new CountingSorter(this.blockOfIdTriples, this.index, 0, this.dir + "S_Run_"+this.runNumber+"_", mapping.length);
      threadS.start();
      final Thread threadP = new CountingSorter(this.blockOfIdTriples, this.index, 1, this.dir + "P_Run_"+this.runNumber+"_", mapping.length);
      threadP.start();
      final Thread threadO = new CountingSorter(this.blockOfIdTriples, this.index, 2, this.dir + "O_Run_"+this.runNumber+"_", mapping.length);
      threadO.start();

      try {
        threadS.join();
        threadP.join();
        threadO.join();
      } catch (final InterruptedException e) {
        log.error(e.getMessage(), e);
      }

      this.runNumber++;
      this.index = 0;
    }

    public List<TrieSet> getTries(){
      return this.listOfTries;
    }
  }

  /**
   * Sorts the id-triples of an initial run with counting sort according to the primary sort condition and
   * afterwards uses quicksort to sort them according to the secondary and tertiary sort criteria
   * (e.g., first sort according to S, P and O; afterwards derive the SPO and SOP orders from the sorting
   * according to S, PSO and POS from the sorting according to P, and OSP and OPS from the sorting according to O).
   */
  public static class CountingSorter extends Thread {

    private final int[][] blockOfIdTriples;
    private final int end;
    private final int pos;
    private final String filePrefix;
    private final int max_code;

    public CountingSorter(final int[][] blockOfIdTriples, final int end, final int pos, final String filePrefix, final int max_code) {
      this.blockOfIdTriples = blockOfIdTriples;
      this.end = end;
      this.pos = pos;
      this.filePrefix = filePrefix;
      this.max_code = max_code;
    }

    @Override
    public void run(){
      // start counting sort
      final int[] numberOfOccurences = new int[this.max_code];
      int numberOfBorders = 0;
      for(int i=0; i<this.end; i++){
        final int index = this.blockOfIdTriples[i][this.pos];
        if(numberOfOccurences[index]==0){
          numberOfBorders++;
        }
        numberOfOccurences[index]++;
      }
      // calculate addresses and borders...
      final int[] borders = new int[numberOfBorders];
      int index_borders = 0;
      if(numberOfOccurences[0]!=0) {
        borders[0] = numberOfOccurences[0];
        index_borders = 1;
      }
      for(int i=0; i < this.max_code - 1; i++){
        final boolean flag = (numberOfOccurences[i+1]>0);
        numberOfOccurences[i+1] = numberOfOccurences[i] + numberOfOccurences[i+1];
        if(flag) {
          borders[index_borders] = numberOfOccurences[i+1];
          index_borders++;
        }
      }
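      // numberOfOccurences now contains the cumulative counts and borders the (exclusive) end positions of the
      // groups of triples with the same primary id, e.g. for the occurrences [1, 1, 3] we get borders = [1, 2, 5]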
      // do sorting
      final int[][] blockOfSortedIdTriples = new int[this.end][];
      for(int i=0; i<this.end; i++){
        final int key = this.blockOfIdTriples[i][this.pos];
        blockOfSortedIdTriples[numberOfOccurences[key] - 1]=this.blockOfIdTriples[i];
        numberOfOccurences[key]--;
      }
      // now sort according to the secondary and tertiary conditions
      final int other_condition1 = (this.pos==0)?1:0;
      final int other_condition2 = (this.pos==2)?1:2;

      final Thread thread1 = new SecondaryConditionSorter(blockOfSortedIdTriples, this.pos, other_condition1, other_condition2, borders, this.filePrefix);
      thread1.start();
      final Thread thread2 = new SecondaryConditionSorter(blockOfSortedIdTriples, this.pos, other_condition2, other_condition1, borders, this.filePrefix);
      thread2.start();
      try {
        thread1.join();
        thread2.join();
      } catch (final InterruptedException e) {
        log.error(e.getMessage(), e);
      }
    }
  }

  /**
   * Sort routines for sorting according to the secondary and tertiary sort criteria...
   */
  public static class SecondaryConditionSorter extends Thread {

    private final int[][] blockOfSortedIdTriples;
    private final int primary_pos;
    private final int secondary_pos;
    private final int tertiary_pos;
    private final int[] borders;
    private final String fileName;

    public SecondaryConditionSorter(final int[][] blockOfSortedIdTriples, final int primary_pos, final int secondary_pos, final int tertiary_pos, final int[] borders, final String filePrefix) {
      this.blockOfSortedIdTriples = blockOfSortedIdTriples;
      this.primary_pos = primary_pos;
      this.secondary_pos = secondary_pos;
      this.tertiary_pos = tertiary_pos;
      this.borders = borders;
      this.fileName = filePrefix + FastRDF3XIndexConstruction.map[this.secondary_pos] + FastRDF3XIndexConstruction.map[this.tertiary_pos] ;
    }

    @Override
    public void run() {
      final int[][] blockOfFinallySortedIdTriples = new int[this.blockOfSortedIdTriples.length][];
      final ExecutorService executor = Executors.newFixedThreadPool(FastRDF3XIndexConstruction.NUMBER_OF_THREADS);
      int last_index = 0;
      for(int i=0; i<this.borders.length; i++){
        final int current_end = this.borders[i];
        executor.submit(new BasicSorter(this.blockOfSortedIdTriples, blockOfFinallySortedIdTriples, this.secondary_pos, this.tertiary_pos, last_index, current_end));
        last_index = current_end;
      }
      executor.shutdown();
      try {
        executor.awaitTermination(10, TimeUnit.DAYS);
      } catch (final InterruptedException e) {
        log.error(e.getMessage(), e);
      }

      // write out initial run:
      try {
        final OutputStream out = new BufferedOutputStream(new FileOutputStream(this.fileName));
        // write run in a compressed way:
        int start = 0;
        int previousPrimaryCode = 0;
        for(int i=0; i<this.borders.length; i++) {
          final int end = this.borders[i];
          previousPrimaryCode = FastRDF3XIndexConstruction.writeBlock(blockOfFinallySortedIdTriples, start, end, this.primary_pos, this.secondary_pos, this.tertiary_pos, previousPrimaryCode, out);
          start = end;
        }
        out.close();
      } catch (final FileNotFoundException e) {
        log.error(e.getMessage(), e);
      } catch (final IOException e) {
        log.error(e.getMessage(), e);
      }
    }
  }

  /**
   * Writes out a 'block'. A block has the same id at the primary position and is sorted according to the secondary and tertiary sort criteria.
   *
   * @param block the id-triples to write out
   * @param start the index of the first triple of the block
   * @param end the index after the last triple of the block
   * @param primaryPos the primary sort position (0 = S, 1 = P, 2 = O)
   * @param secondaryPos the secondary sort position
   * @param tertiaryPos the tertiary sort position
   * @param previousPrimaryCode the primary code of the previously written block (for difference encoding)
   * @param out the output stream of the run
   * @return the primary code of this block (to be passed as previousPrimaryCode when writing the next block)
   * @throws IOException if writing to the output stream fails
   */
  private static int writeBlock(final int[][] block, final int start, final int end, final int primaryPos, final int secondaryPos, final int tertiaryPos, final int previousPrimaryCode, final OutputStream out) throws IOException{
    final int primaryCode = block[start][primaryPos];
    // difference encoding for the primary position
    OutHelper.writeLuposIntVariableBytes(primaryCode - previousPrimaryCode, out);

    // how many triples share this primary code?
    OutHelper.writeLuposIntVariableBytes(end-start, out);

    int previousSecondaryCode = 0;
    int previousTertiaryCode = 0;
    for(int j=start; j<end; j++) {
      final int secondaryCode = block[j][secondaryPos];
      // use difference encoding also for the secondary position
      final int differenceSecondaryPosition = secondaryCode - previousSecondaryCode;

      OutHelper.writeLuposIntVariableBytes(differenceSecondaryPosition, out);
      if(differenceSecondaryPosition>0){
        // the secondary position changed, hence the difference encoding for the tertiary position starts again from 0
        previousTertiaryCode = 0;
      }
      final int tertiaryCode = block[j][tertiaryPos];
      OutHelper.writeLuposIntVariableBytes(tertiaryCode - previousTertiaryCode, out);

      previousSecondaryCode = secondaryCode;
      previousTertiaryCode = tertiaryCode;
    }
    return primaryCode;
  }
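
  /*
   * A small worked example of the encoding above (assuming previousPrimaryCode = 0):
   * the block (5,2,7), (5,2,9), (5,4,1) - given in (primary, secondary, tertiary) order - is written as
   * 5 (primary delta), 3 (number of triples), 2, 7 (first triple), 0, 2 (second triple: secondary unchanged,
   * tertiary delta 9-7), 2, 1 (third triple: secondary delta 4-2, tertiary baseline restarts at 0),
   * and 5 is returned as the new previousPrimaryCode.
   */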

  /**
   * Quicksort for sorting a 'block' according to the secondary and tertiary sort criteria.
   * A 'block' is a set of triples with the same id at the primary position.
   */
  public static class BasicSorter extends Thread {

    private final int[][] blockOfSortedIdTriples;
    private final int secondary_pos;
    private final int tertiary_pos;
    private final int[][] blockOfFinallySortedIdTriples;
    private final int start;
    private final int end;

    public BasicSorter(final int[][] blockOfSortedIdTriples, final int[][] blockOfFinallySortedIdTriples, final int secondary_pos, final int tertiary_pos, final int start, final int end) {
      this.blockOfSortedIdTriples = blockOfSortedIdTriples;
      this.blockOfFinallySortedIdTriples = blockOfFinallySortedIdTriples;
      this.secondary_pos = secondary_pos;
      this.tertiary_pos = tertiary_pos;
      this.start = start;
      this.end = end;
    }

    @Override
    public void run(){
      System.arraycopy(this.blockOfSortedIdTriples, this.start, this.blockOfFinallySortedIdTriples, this.start, this.end - this.start);
      this.quicksort(this.start, this.end - 1);
    }

    private void quicksort(final int low, final int high) {
      int i = low, j = high;
      // Get the pivot element from the middle of the list
      final int[] pivot = this.blockOfFinallySortedIdTriples[low + (high-low)/2];

      // Divide into two lists
      while (i <= j) {
        // If the current value from the left list is smaller than the pivot
        // element, then get the next element from the left list.
        // Check the secondary and tertiary sort conditions!
        while (this.blockOfFinallySortedIdTriples[i][this.secondary_pos] < pivot[this.secondary_pos] ||
            (this.blockOfFinallySortedIdTriples[i][this.secondary_pos] == pivot[this.secondary_pos] &&
                this.blockOfFinallySortedIdTriples[i][this.tertiary_pos] < pivot[this.tertiary_pos])) {
          i++;
        }
        // If the current value from the right list is larger than the pivot
        // element, then get the next element from the right list.
        // Check the secondary and tertiary sort conditions!
        while (this.blockOfFinallySortedIdTriples[j][this.secondary_pos] > pivot[this.secondary_pos] ||
            (this.blockOfFinallySortedIdTriples[j][this.secondary_pos] == pivot[this.secondary_pos] &&
            this.blockOfFinallySortedIdTriples[j][this.tertiary_pos] > pivot[this.tertiary_pos])) {
          j--;
        }

        // If we have found a value in the left list which is larger than the
        // pivot element and a value in the right list which is smaller than
        // the pivot element, then we exchange the values.
        // Afterwards we can increase i and decrease j.
        if (i <= j) {
          final int[] tmp = this.blockOfFinallySortedIdTriples[i];
          this.blockOfFinallySortedIdTriples[i] = this.blockOfFinallySortedIdTriples[j];
          this.blockOfFinallySortedIdTriples[j] = tmp;
          i++;
          j--;
        }
      }
      // Recursion
      if (low < j){
        this.quicksort(low, j);
      }
      if (i < high){
        this.quicksort(i, high);
      }
    }
  }

  /**
   * Maps the initial runs with their local ids to the global ids of the global dictionary
   */
  public static class LocalToGlobalIdMapper extends Thread {

    private final String filename;
    private final int[] mapping;

    public LocalToGlobalIdMapper(final String filename, final int[] mapping) {
      this.filename = filename;
      this.mapping = mapping;
    }

    @Override
    public void run() {
      try {
        final InputStream in = new BufferedInputStream(new FileInputStream(this.filename));
        final OutputStream out = new BufferedOutputStream(new FileOutputStream(this.filename + "_mapped"));

        int previousPrimaryCode = 0;
        int previousMappedPrimaryCode = 0;
        Integer primaryCode;
        while((primaryCode = InputHelper.readLuposIntVariableBytes(in)) != null){
          primaryCode+=previousPrimaryCode;
          final int primaryMappedCode = this.mapping[primaryCode];
          OutHelper.writeLuposIntVariableBytes(primaryMappedCode - previousMappedPrimaryCode, out);

          final int repetitions = InputHelper.readLuposIntVariableBytes(in);
          OutHelper.writeLuposIntVariableBytes(repetitions, out);

          int previousSecondaryCode = 0;
          int previousMappedSecondaryCode = 0;
          int previousTertiaryCode = 0;
          int previousMappedTertiaryCode = 0;

          for(int i=0; i<repetitions; i++) {
            final int secondaryCode = InputHelper.readLuposIntVariableBytes(in) + previousSecondaryCode;
            final int secondaryMappedCode = this.mapping[secondaryCode];
            OutHelper.writeLuposIntVariableBytes(secondaryMappedCode - previousMappedSecondaryCode, out);
            if(secondaryMappedCode != previousMappedSecondaryCode) {
              previousTertiaryCode = 0;
              previousMappedTertiaryCode = 0;
            }

            final int tertiaryCode = InputHelper.readLuposIntVariableBytes(in) + previousTertiaryCode;
            final int tertiaryMappedCode = this.mapping[tertiaryCode];
            OutHelper.writeLuposIntVariableBytes(tertiaryMappedCode - previousMappedTertiaryCode, out);

            previousMappedSecondaryCode = secondaryMappedCode;
            previousSecondaryCode = secondaryCode;
            previousMappedTertiaryCode = tertiaryMappedCode;
            previousTertiaryCode = tertiaryCode;
          }

          previousPrimaryCode = primaryCode;
          previousMappedPrimaryCode = primaryMappedCode;
        }

        out.close();
        in.close();
        FileHelper.deleteFile(this.filename);
      } catch (final FileNotFoundException e) {
        log.error(e.getMessage(), e);
      } catch (final IOException e) {
        log.error(e.getMessage(), e);
      }
    }
  }

  /**
   * Just for iterating through a run (decodes the difference encoding written by writeBlock)
   */
  public static class IteratorFromRun {

    private final InputStream in;

    int previousPrimaryCode = 0;
    int previousSecondaryCode = 0;
    int previousTertiaryCode = 0;
    int leftWithSamePrimaryCode = 0;

    public IteratorFromRun(final String filename) throws EOFException, FileNotFoundException, IOException {
      this.in = new BufferedInputStream(new FileInputStream(filename));
    }

    public int[] next() throws IOException {
      if(this.leftWithSamePrimaryCode==0) {
        final Integer code = InputHelper.readLuposIntVariableBytes(this.in);
        if(code == null){
          this.in.close();
          return null;
        }
        this.previousPrimaryCode += code;
        this.previousSecondaryCode = 0;
        this.previousTertiaryCode = 0;
        this.leftWithSamePrimaryCode = InputHelper.readLuposIntVariableBytes(this.in);
      }
      final int code = InputHelper.readLuposIntVariableBytes(this.in);
      if(code > 0) {
        this.previousTertiaryCode = 0;
      }
      this.previousSecondaryCode += code;
      this.previousTertiaryCode += InputHelper.readLuposIntVariableBytes(this.in);
      this.leftWithSamePrimaryCode--;
      return new int[] { this.previousPrimaryCode, this.previousSecondaryCode, this.previousTertiaryCode};
    }
  }

  /**
   * This is just for generating B+-trees from the final run
   */
  public static class GeneratorFromFinalRun implements Generator<TripleKey, Triple>{

    private final String filename;
    private final int size;
    private int primaryMap;
    private int secondaryMap;
    private int tertiaryMap;
    private final CollationOrder order;

    public GeneratorFromFinalRun(final String filename, final int size, final int primaryPos, final int secondaryPos, final int tertiaryPos) {
      this.filename = filename;
      this.size = size;
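      // primaryMap, secondaryMap and tertiaryMap store at which position of a run triple the subject,
      // predicate and object can be found (i.e., the inverse of the collation order); e.g. for the
      // collation order POS (primaryPos=1, secondaryPos=2, tertiaryPos=0) we get
      // primaryMap (index of S) = 2, secondaryMap (index of P) = 0 and tertiaryMap (index of O) = 1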
      switch(primaryPos){
        default:
        case 0:
          this.primaryMap = 0;
          break;
        case 1:
          this.secondaryMap = 0;
          break;
        case 2:
          this.tertiaryMap = 0;
          break;
      }
      switch(secondaryPos){
        default:
        case 0:
          this.primaryMap = 1;
          break;
        case 1:
          this.secondaryMap = 1;
          break;
        case 2:
          this.tertiaryMap = 1;
          break;
      }
      switch(tertiaryPos){
        default:
        case 0:
          this.primaryMap = 2;
          break;
        case 1:
          this.secondaryMap = 2;
          break;
        case 2:
          this.tertiaryMap = 2;
          break;
      }
      this.order = CollationOrder.createCollationOrder(primaryPos, secondaryPos);
    }

    @Override
    public int size() {
      return this.size;
    }

    @Override
    public Iterator<Entry<TripleKey, Triple>> iterator() {
      try {
        return new ImmutableIterator<Entry<TripleKey, Triple>>(){

          IteratorFromRun it = new IteratorFromRun(GeneratorFromFinalRun.this.filename);
          int[] triple = this.it.next();

          @Override
          public boolean hasNext() {
            return this.triple!=null;
          }

          @Override
          public Entry<TripleKey, Triple> next() {
            if(this.triple==null){
              return null;
            }
            final Triple t = new Triple(new LazyLiteral(this.triple[GeneratorFromFinalRun.this.primaryMap]),
                new LazyLiteral(this.triple[GeneratorFromFinalRun.this.secondaryMap]),
                new LazyLiteral(this.triple[GeneratorFromFinalRun.this.tertiaryMap]));

            final TripleKey key = new TripleKey(t, GeneratorFromFinalRun.this.order);

            try {
              this.triple = this.it.next();

            } catch (final IOException e) {
              log.error(e.getMessage(), e);
              this.triple = null;
            }

            return new MapEntry<TripleKey, Triple>(key, t);
          }
        };
      } catch (final IOException e) {
        log.error(e.getMessage(), e);
      }
      return null;
    }

  }

  /**
   * This is a container for merging the initial runs...
   */
  public static class HeapElementContainer implements Comparable<HeapElementContainer> {

    private final int[] triple;
    private final int run;

    public HeapElementContainer(final int[] triple, final int run){
      this.triple= triple;
      this.run = run;
    }

    @Override
    public final int compareTo(final HeapElementContainer o) {
      for(int pos=0; pos<3; pos++) {
        final int compare = this.triple[pos] - o.triple[pos];
        if(compare!=0){
          return compare;
        }
      }
      return 0;
    }

    public final int[] getTriple() {
      return this.triple;
    }

    public final int getRun() {
      return this.run;
    }
  }

  /**
   * This is just for writing the final run
   */
  public static class FinalRunWriter {

    private final static int MAX = 8 * 1024;
    private final int[][] block = new int[FinalRunWriter.MAX][];
    private int current = 0;
    private int previousPrimaryCode = 0;
    private final OutputStream out;

    public FinalRunWriter(final String filename) throws FileNotFoundException, IOException {
      this.out = new BufferedOutputStream(new FileOutputStream(filename));
    }
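
    // buffers up to MAX triples and flushes them as one block whenever the buffer is full or the primary
    // (first) id changes; if a flush happens because the buffer is full, the next block simply continues
    // with a primary delta of 0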

    public void write(final int[] triple) {
      if(this.current==FinalRunWriter.MAX || (this.current>0 && this.block[this.current-1][0]!=triple[0])) {
        this.writeBlock();
      }
      this.block[this.current] = triple;
      this.current++;
    }

    private void writeBlock() {
      if(this.current>0) {
        try {
          this.previousPrimaryCode = FastRDF3XIndexConstruction.writeBlock(this.block, 0, this.current, 0, 1, 2, this.previousPrimaryCode, this.out);
        } catch (final IOException e) {
          log.error(e.getMessage(), e);
        }
        this.current = 0;
      }
    }

    public void close() throws IOException {
      this.writeBlock();
      this.out.close();
    }
  }
}