Package org.apache.stanbol.entityhub.indexing

Source Code of org.apache.stanbol.entityhub.indexing.Urify$ReaderDaemon

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.entityhub.indexing;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
* Implemented to allow importing the Musicbrainz dump to Jena TDB.<p>
*
* The problem is that RDF dumps with blank nodes do require to store a
* lookup table for the black nodes IDs during import.<p> In case a dump
* contains millions of such nodes this table does no longer fit into
* memory. This makes importing Dumps to with the Jena RDF parser impossible.
* <p>
* This Utility can replaces nodes that start with "_:{id}" with
* "<{prefix}{id}>. The prefix must be set with the "-p" parameter.
* <p>
* This tool supports "gz" and "bz2" compressed files. If the output will use
* the same compression as the input. It uses two threads for "reading/processing"
* and "writing". It could also process multiple files in parallel. However this
* feature is not yet activated as the Musicbrainz dump comes in a single file.
*
* @author Rupert Westenthaler
*
*/
public class Urify implements Runnable{

    private static final String EOF_INDICATOR = "__EOF_INDICATOR";
   
    private static Logger log = LoggerFactory.getLogger(Urify.class);
   
    private static final Options options;
    static {
        options = new Options();
        options.addOption("h", "help", false, "display this help and exit");
        options.addOption("p","prefix",true,
            "The URI prefix used for wrapping the bNode Id");
        options.addOption("e","encoding",true, "the char encodinf (default: UTF-8)");
    }
    /**
     * @param args
     * @throws ParseException
     */
    public static void main(String[] args) throws IOException, ParseException {
        CommandLineParser parser = new PosixParser();
        CommandLine line = parser.parse(options, args);
        args = line.getArgs();
        if(!line.hasOption('p')){
            log.error("Missing parameter 'prefix' ('p)!");
            System.exit(1);
        }
        String prefix = "<"+line.getOptionValue('p');
        log.info("prfix: {} ",line.getOptionValue('p'));
        Charset charset;
        if(line.hasOption('e')){
            charset = Charset.forName(line.getOptionValue('e'));
            if(charset == null){
                log.error("Unsupported encoding '{}'!",line.getOptionValue('e'));
                System.exit(1);
            }
        } else {
            charset = Charset.forName("UTF-8");
        }
        log.info("charset: {} ",charset.name());
        Urify urify = new Urify(Arrays.asList(args), charset, prefix);
        urify.run(); //TODO: this could support processing multiple files in parallel
    }

    private final Charset charset;
    private final String prefix;
    protected long start = System.currentTimeMillis();
    protected long uf_count = 0;

    private List<String> resources;

    public Urify(List<String> resources, Charset charset, String prefix) throws IOException {
        this.charset = charset;
        this.prefix = prefix;
        this.resources = Collections.synchronizedList(new ArrayList<String>(resources));
    }
   
    public void run() {
        String source;
        do {
            synchronized (resources) {
                if(resources.isEmpty()){
                    source = null;
                } else {
                    source = resources.remove(0);
                    try {
                        urify(source);
                    } catch (Exception e) {
                        log.error("Unable to Urify "+resources,e);
                    }
                }
            }
        } while (source != null);
    }
    private void urify(String resource) throws IOException {
        File source = new File(resource);
        if(source.isFile()){
            String path = FilenameUtils.getFullPathNoEndSeparator(resource);
            String name = FilenameUtils.getName(resource);
            File target = new File(path,"uf_"+name);
            int i=0;
            while(target.exists()){
                i++;
                target = new File(path,"uf"+i+"_"+name);
            }
            InputStream is = new FileInputStream(source);
            OutputStream os = new FileOutputStream(target);
            log.info("Resource: {}",resource);
            log.info("Target  : {}",target);
            if ("gz".equalsIgnoreCase(FilenameUtils.getExtension(name))) {
                is = new GZIPInputStream(is);
                os = new GZIPOutputStream(os);
                name = FilenameUtils.removeExtension(name);
                log.debug("   - from GZIP Archive");
            } else if ("bz2".equalsIgnoreCase(FilenameUtils.getExtension(name))) {
                is = new BZip2CompressorInputStream(is);
                os = new BZip2CompressorOutputStream(os);
                name = FilenameUtils.removeExtension(name);
                log.debug("   - from BZip2 Archive");
            }// TODO: No Zip File support
            //else no complression
            BlockingQueue<String> queue = new ArrayBlockingQueue<String>(1000);
            ReaderDaemon reader = new ReaderDaemon(new BufferedReader(new InputStreamReader(is, charset)), queue);
            WriterDaemon writer = new WriterDaemon(
                new BufferedWriter(new OutputStreamWriter(os, charset)), queue);
            Thread readerDaemon = new Thread(reader,name+" reader");
            Thread writerDaemon = new Thread(writer,name+" writer");
            readerDaemon.setDaemon(true);
            writerDaemon.setDaemon(true);
            writerDaemon.start();
            readerDaemon.start();
            Object notifier = writer.getNotifier();
            synchronized (notifier) { //wait until processed
               if(!writer.completed()){
                   try {
                       notifier.wait();
                   } catch (InterruptedException e) {/*ignore*/}
               }
            }
            if(reader.getError() != null){
                throw new IOException("Error while reading source "+source,reader.getError());
            }
            if(writer.getError() != null){
                throw new IOException("Error while writing resource "+target,writer.getError());
            }
            log.info(" ... completed resource {}",resource);
        } else {
            throw new FileNotFoundException("Parsed File "+resource+" does not exist or is not a File!");
        }
    }

    private class ReaderDaemon implements Runnable {
        private final BufferedReader reader;
        private final BlockingQueue<String> queue;

        private Exception error;
       
        protected ReaderDaemon(BufferedReader reader, BlockingQueue<String> queue){
            this.reader = reader;
            this.queue = queue;
        }
       
       
        public Exception getError() {
            return error;
        }


        @Override
        public void run() {
            String triple;
            try {
                while((triple = reader.readLine()) != null){
                    StringBuilder sb = new StringBuilder();
                    for(String node : triple.split(" ")){
                        if(node.startsWith("_:")){ //convert to uri
                            sb.append(prefix);
                            sb.append(node.substring(2,node.length()));
                            sb.append("> ");
                            uf_count++;
                        } else {
                            sb.append(node);
                            if(node.length()>1){
                                //the final '.' is also a node
                                sb.append(" ");
                            }
                        }
                    }
                    queue.put(sb.toString());
                }
            } catch (IOException e) {
                error = e;
            } catch (InterruptedException e) {
                error = e;
            } finally {
                IOUtils.closeQuietly(reader);
                try {
                    queue.put(EOF_INDICATOR); //indicates finished
                } catch (InterruptedException e) {
                    log.error("Unable to put EOF to queue!",e);
                }
            }
        }
    }
   
    private class WriterDaemon implements Runnable {
        private final BufferedWriter writer;
        private final BlockingQueue<String> queue;
        private final Object notifier = new Object();
        private boolean completed = false;

        private Exception error;
       
        protected WriterDaemon(BufferedWriter writer, BlockingQueue<String> queue){
            this.writer = writer;
            this.queue = queue;
        }
       
        public Exception getError() {
            return error;
        }

        public Object getNotifier() {
            return notifier;
        }

        public boolean completed() {
            return completed;
        }

        @Override
        public void run() {
            String triple;
            long count = 0;
            boolean first = true;
            try {
                while(!EOF_INDICATOR.equals((triple = queue.take()))){
                    if(count % 1000000 == 0){
                        //NOTE: urified will not be correct as it is counted
                        //      by an other thread, but for logging ...
                        long end = System.currentTimeMillis();
                        log.info("processed {} | urified: {} (batch: {}sec)",
                            new Object[]{count,uf_count,((double)(end-start))/1000});
                        start = end;
                    }
                    count++;
                    if(first){
                        first = false;
                    } else {
                        writer.write("\n");
                    }
                    writer.write(triple);
                   
                }
            } catch (InterruptedException e) {
                this.error = e;
            } catch (IOException e) {
                this.error = e;
            } finally {
                IOUtils.closeQuietly(writer);
                this.completed = true;
                synchronized (notifier) {
                    notifier.notifyAll();
                }
            }
        }
    }
}
TOP

Related Classes of org.apache.stanbol.entityhub.indexing.Urify$ReaderDaemon

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.