Package cmd

Source Code of cmd.tdbloader4

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package cmd;

import static org.apache.jena.tdbloader4.Constants.OUTPUT_PATH_POSTFIX_1;
import static org.apache.jena.tdbloader4.Constants.OUTPUT_PATH_POSTFIX_2;
import static org.apache.jena.tdbloader4.Constants.OUTPUT_PATH_POSTFIX_3;
import static org.apache.jena.tdbloader4.Constants.OUTPUT_PATH_POSTFIX_4;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.jena.tdbloader4.Constants;
import org.apache.jena.tdbloader4.FirstDriver;
import org.apache.jena.tdbloader4.FourthDriver;
import org.apache.jena.tdbloader4.SecondDriver;
import org.apache.jena.tdbloader4.ThirdDriver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.hp.hpl.jena.graph.Graph;
import com.hp.hpl.jena.graph.Node;
import com.hp.hpl.jena.graph.Triple;
import com.hp.hpl.jena.sparql.core.DatasetGraph;
import com.hp.hpl.jena.tdb.TDBFactory;
import com.hp.hpl.jena.tdb.TDBLoader;
import com.hp.hpl.jena.tdb.base.file.Location;
import com.hp.hpl.jena.tdb.store.DatasetGraphTDB;
import com.hp.hpl.jena.tdb.sys.SetupTDB;
import com.hp.hpl.jena.tdb.transaction.DatasetGraphTransaction;
import com.hp.hpl.jena.util.iterator.ExtendedIterator;

public class tdbloader4 extends Configured implements Tool {

    private static final Logger log = LoggerFactory.getLogger(tdbloader4.class);
 
    public tdbloader4 () {
    super();
        log.debug("constructed with no configuration.");
  }

  public tdbloader4 (Configuration configuration) {
    super(configuration);
        log.debug("constructed with configuration.");
  }
   
  @Override
  public int run(String[] args) throws Exception {
    if ( args.length != 2 ) {
      System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getName());
      ToolRunner.printGenericCommandUsage(System.err);
      return -1;
    }
   
    Configuration configuration = getConf();
    configuration.set(Constants.RUN_ID, String.valueOf(System.currentTimeMillis()));
        boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT, Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
        boolean copyToLocal = configuration.getBoolean(Constants.OPTION_COPY_TO_LOCAL, Constants.OPTION_COPY_TO_LOCAL_DEFAULT);
        boolean verify = configuration.getBoolean(Constants.OPTION_VERIFY, Constants.OPTION_VERIFY_DEFAULT);
        boolean runLocal = configuration.getBoolean(Constants.OPTION_RUN_LOCAL, Constants.OPTION_RUN_LOCAL_DEFAULT);
       
        FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
        if ( overrideOutput ) {
            fs.delete(new Path(args[1]), true);
            fs.delete(new Path(args[1] + OUTPUT_PATH_POSTFIX_1), true);
            fs.delete(new Path(args[1] + OUTPUT_PATH_POSTFIX_2), true);
            fs.delete(new Path(args[1] + OUTPUT_PATH_POSTFIX_3), true);
            fs.delete(new Path(args[1] + OUTPUT_PATH_POSTFIX_4), true);
        }
       
        if ( ( copyToLocal ) || ( runLocal ) ) {
          File path = new File(args[1]);
          path.mkdirs();
        }
   
        Tool first = new FirstDriver(configuration);
        int status = first.run(new String[] { args[0], args[1] + OUTPUT_PATH_POSTFIX_1 });
        if (status != 0){ return status ;}

        createOffsetsFile(fs, args[1] + OUTPUT_PATH_POSTFIX_1, args[1] + OUTPUT_PATH_POSTFIX_1);
        Path offsets = new Path(args[1] + OUTPUT_PATH_POSTFIX_1, Constants.OFFSETS_FILENAME);
        DistributedCache.addCacheFile(offsets.toUri(), configuration);
       
        Tool second = new SecondDriver(configuration);
        status = second.run(new String[] { args[0], args[1] + OUTPUT_PATH_POSTFIX_2 });
        if (status != 0){ return status ;}

        Tool third = new ThirdDriver(configuration);
        status = third.run(new String[] { args[1] + OUTPUT_PATH_POSTFIX_2, args[1] + OUTPUT_PATH_POSTFIX_3 });
        if (status != 0){ return status ;}

        Tool fourth = new FourthDriver(configuration);
        status = fourth.run(new String[] { args[1] + OUTPUT_PATH_POSTFIX_3, args[1] + OUTPUT_PATH_POSTFIX_4 });
        if (status != 0){ return status ;}

        if ( copyToLocal ) {
          Tool download = new download(configuration);
          download.run(new String[] { args[1] + OUTPUT_PATH_POSTFIX_2, args[1] + OUTPUT_PATH_POSTFIX_4, args[1] });
        }
       
        if ( verify ) {
            DatasetGraphTDB dsgMem = load(args[0]);
            Location location = new Location(args[1]);
           
            if (!copyToLocal) {
              // TODO: this is a sort of a cheat and it could go away (if it turns out to be too slow)!
              download.fixNodeTable2(location);
            }

            DatasetGraphTDB dsgDisk = SetupTDB.buildDataset(location) ;
            boolean isomorphic = isomorphic ( dsgMem, dsgDisk );
            System.out.println ("> " + isomorphic);
        }
       
    return status;
  }

  public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new tdbloader4(), args);
    System.exit(exitCode);
  }
 
  private void createOffsetsFile(FileSystem fs, String input, String output) throws IOException {
    log.debug("Creating offsets file...");
        Map<Long, Long> offsets = new TreeMap<Long, Long>();
        FileStatus[] status = fs.listStatus(new Path(input));
        for (FileStatus fileStatus : status) {
          Path file = fileStatus.getPath();
          if ( file.getName().startsWith("part-r-") ) {
            log.debug("Processing: {}", file.getName());
             BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(file)));
             String line = in.readLine();
             String[] tokens = line.split("\\s");
             long partition = Long.valueOf(tokens[0]);
                 long offset = Long.valueOf(tokens[1]);
                 log.debug("Partition {} has offset {}", partition, offset);
                 offsets.put(partition, offset);
          }
    }

        Path outputPath = new Path(output, Constants.OFFSETS_FILENAME);
        PrintWriter out = new PrintWriter(new OutputStreamWriter( fs.create(outputPath)));
        for (Long partition : offsets.keySet()) {
      out.println(partition + "\t" + offsets.get(partition));
    }
        out.close();
        log.debug("Offset file created.");
  }
 
    public static boolean isomorphic(DatasetGraph dsgMem, DatasetGraph dsgDisk) {
        if (!dsgMem.getDefaultGraph().isIsomorphicWith(dsgDisk.getDefaultGraph()))
            return false;
        Iterator<Node> graphsMem = dsgMem.listGraphNodes();
        Iterator<Node> graphsDisk = dsgDisk.listGraphNodes();
       
        Set<Node> seen = new HashSet<Node>();

        while (graphsMem.hasNext()) {
            Node graphNode = graphsMem.next();
            if (dsgDisk.getGraph(graphNode) == null) return false;
            if (!dsgMem.getGraph(graphNode).isIsomorphicWith(dsgDisk.getGraph(graphNode))) return false;
            seen.add(graphNode);
        }

        while (graphsDisk.hasNext()) {
            Node graphNode = graphsDisk.next();
            if (!seen.contains(graphNode)) {
                if (dsgMem.getGraph(graphNode) == null) return false;
                if (!dsgMem.getGraph(graphNode).isIsomorphicWith(dsgDisk.getGraph(graphNode))) return false;
            }
        }

        return true;
    }

    public static DatasetGraphTDB load(String inputPath) {
        List<String> urls = new ArrayList<String>();
        for (File file : new File(inputPath).listFiles()) {
            if (file.isFile()) {
                urls.add(file.getAbsolutePath());
            }
        }
        DatasetGraphTransaction dsg = (DatasetGraphTransaction)TDBFactory.createDatasetGraph();
        TDBLoader.load(dsg.getBaseDatasetGraph(), urls);

        return dsg.getBaseDatasetGraph();
    }
   
    public static String dump(DatasetGraph dsgMem, DatasetGraph dsgDisk) {
        StringBuffer sb = new StringBuffer();
        sb.append("\n");

        if (!dsgMem.getDefaultGraph().isIsomorphicWith(dsgDisk.getDefaultGraph())) {
            sb.append("Default graphs are not isomorphic [FAIL]\n");
            sb.append("    First:\n");
            dump(sb, dsgMem.getDefaultGraph());
            sb.append("    Second:\n");
            dump(sb, dsgDisk.getDefaultGraph());
        } else {
            sb.append("Default graphs are isomorphic [OK]\n");
        }

        Iterator<Node> graphsMem = dsgMem.listGraphNodes();
        Iterator<Node> graphsDisk = dsgDisk.listGraphNodes();
        Set<Node> seen = new HashSet<Node>();

        while (graphsMem.hasNext()) {
            Node graphNode = graphsMem.next();
            if (dsgDisk.getGraph(graphNode) == null) sb.append(graphNode + " is present only in one dataset. [FAIL]\n");
            if (!dsgMem.getGraph(graphNode).isIsomorphicWith(dsgDisk.getGraph(graphNode))) {
                sb.append("\n" + graphNode + " graphs are not isomorphic [FAIL]\n");
                sb.append("    First:\n");
                dump(sb, dsgMem.getGraph(graphNode));
                sb.append("    Second:\n");
                dump(sb, dsgDisk.getGraph(graphNode));
            }
            seen.add(graphNode);
        }

        while (graphsDisk.hasNext()) {
            Node graphNode = graphsDisk.next();
            if (!seen.contains(graphNode)) {
                if (dsgMem.getGraph(graphNode) == null) sb.append(graphNode + " is present only in one dataset. [FAIL]\n");
                if (!dsgMem.getGraph(graphNode).isIsomorphicWith(dsgDisk.getGraph(graphNode))) {
                    sb.append("\n" + graphNode + " graphs are not isomorphic [FAIL]\n");
                    sb.append("    First:\n");
                    dump(sb, dsgMem.getGraph(graphNode));
                    sb.append("    Second:\n");
                    dump(sb, dsgDisk.getGraph(graphNode));
                }
            }
        }

        return sb.toString();
    }
   
    private static void dump (StringBuffer sb, Graph graph) {
        ExtendedIterator<Triple> iter = graph.find(Node.ANY, Node.ANY, Node.ANY);
        while ( iter.hasNext() ) {
            Triple triple = iter.next();
            sb.append(triple).append("\n");
        }
    }
   
}
TOP

Related Classes of cmd.tdbloader4

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.