Package lupos.rdf.parser

Source Code of lupos.rdf.parser.Parser

/**
* Copyright (c) 2013, Institute of Information Systems (Sven Groppe and contributors of LUPOSDATE), University of Luebeck
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
* following conditions are met:
*
*   - Redistributions of source code must retain the above copyright notice, this list of conditions and the following
*     disclaimer.
*   - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the
*     following disclaimer in the documentation and/or other materials provided with the distribution.
*   - Neither the name of the University of Luebeck nor the names of its contributors may be used to endorse or promote
*     products derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package lupos.rdf.parser;

import lupos.datastructures.items.Triple;
import lupos.datastructures.items.literal.Literal;
import lupos.datastructures.items.literal.LiteralFactory;
import lupos.datastructures.items.literal.URILiteral;
import lupos.engine.evaluators.CommonCoreQueryEvaluator;
import lupos.engine.operators.tripleoperator.TripleConsumer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.net.URISyntaxException;
import java.util.HashMap;

public abstract class Parser {

  private static final Logger log = LoggerFactory.getLogger(Parser.class);

  private int counter = 0;

  private static int maxTriples = 0;

  private BufferedReader reader = null;

  public int getTripleNumber() {
    return this.counter - 1;
  }

  public void parse(final InputStream in, final TripleConsumer tc,
      final String encoding) throws UnsupportedEncodingException {
    try {
      this.reader = new BufferedReader(new InputStreamReader(in, encoding));
      Triple t = this.nextTriple();
      // System.out.println(t);
      // Triple last=t;
      // String lastLine=line;
      // int lastPos=pos;
      while (t != null) {
        // last=t;
        // lastLine=line;
        // lastPos=pos;
        // System.out.println(t);
        tc.consume(t);
        t = this.nextTriple();
        // if (YagoParser.maxTriples > 0
        // && counter > YagoParser.maxTriples - 1)
        // System.out.println("Last triple:" + t);
        if (t != null
            && (t.getSubject() == null || t.getPredicate() == null || t
                .getSubject() == null)) {
          log.debug("Triple:" + t);
          log.debug("Line:" + this.line + "###############pos:"
              + this.pos);
        }

        // System.out.println(t);
      }
      this.prefixe = null;
      log.debug("Line:" + this.line + "###############pos:" + this.pos/*
                                     * +"\nlast triple:"
                                     * +
                                     * last+
                                     * "\nlast line:"
                                     * +
                                     * lastLine
                                     * +
                                     * "############pos:"
                                     * +
                                     * lastPos
                                     */);
    } catch (final EOFException e) {
      this.prefixe = null;
    } finally {
      try {
        if (this.reader != null) {
          this.reader.close();
        }
      } catch (final IOException ioe) {
        ioe.printStackTrace();
      }
    }
  }

  private Literal subject = null;
  private Literal predicate = null;

  private Triple nextTriple() throws EOFException {
    this.counter++;
    if (CommonCoreQueryEvaluator.printNumberOfTriples && this.counter % 1000000 == 0)
     {
      log.error("#Triples:" + this.counter); // in order to display,
    }
    // but not be logged
    if (Parser.maxTriples > 0 && this.counter > Parser.maxTriples) {
      return null;
    }
    Literal nextLiteral;
    nextLiteral = this.nextLiteral();
    if (nextLiteral == null) {
      return null;
    }
    if(this.predicate != null) {
      return new Triple(this.subject, this.predicate, nextLiteral);
    }
    Literal nextLiteral2;
    nextLiteral2 = this.nextLiteral();
    if (this.subject != null) {
      this.predicate = nextLiteral;
      return new Triple(this.subject, nextLiteral, nextLiteral2);
    }
    this.subject = nextLiteral;
    this.predicate = nextLiteral2;
    Literal object;
    object = this.nextLiteral();
    return new Triple(nextLiteral, nextLiteral2, object);
  }

  private String line = null;
  private int pos = -1;
  private int linenumber = 0;

  protected char nextCharacter() throws EOFException {
    if (this.backFlag) {
      this.backFlag = false;
      return this.back;
    }
    if (this.line == null) {
      try {
        this.line = this.reader.readLine();
        this.linenumber++;
      } catch (final IOException e) {
        log.error(e.toString(),e);
        e.printStackTrace();
      }
    }
    this.pos++;
    if (this.line != null && this.pos >= this.line.length()) {
      try {
        this.line = this.reader.readLine();
        this.linenumber++;
        this.pos = -1;
        return '\n';
      } catch (final IOException e) {
        log.error(e.toString(),e);
      }
    }
    if (this.line == null) {
      throw new EOFException();
    }
    return this.line.charAt(this.pos);
  }

  protected HashMap<String, String> prefixe = new HashMap<String, String>();

  private char back;
  private boolean backFlag = false;

  protected abstract char handlePrefix() throws EOFException;

  private Literal nextLiteral() throws EOFException {
    char next = this.jumpOverBlanks();
    while (next == '@') {
      next = this.handlePrefix();
    }
    if (next == '.') {
      this.subject = null;
      this.predicate = null;
      next = this.jumpOverBlanks();
    }
    if (next == ';') {
      this.predicate = null;
      next = this.jumpOverBlanks();
    }
    if (next == ',') {
      next = this.jumpOverBlanks();
    }
    if (next == '\"') {
      // String!
      String s = "" + next;
      boolean marked = false;
      do {
        marked = (!marked && next == '\\');
        next = this.nextCharacter();
        s += next;
      } while (next != '\"' || marked);
      next = this.jumpOverBlanks();
      if (next == '^') {
        // typed literal!
        next = this.nextCharacter();
        if (next != '^') {
          log.error("Typed literal not recognized!");
          return null;
        }
        final Literal datatype = this.nextLiteral();
        try {
          return LiteralFactory.createTypedLiteralWithoutLazyLiteral(
              s, (URILiteral) datatype);
        } catch (final URISyntaxException e) {
          log.error(e.toString(),e);
          return null;
        }
      } if(next =='@') { // language-tagged literal
        String lang = "";
        while (!Parser.isSeparator(next)) {
          lang += next;
          next = this.nextCharacter();
        }
        return LiteralFactory.createLanguageTaggedLiteralWithoutLazyLiteral(s, lang);
      } else {
        this.back = next;
        this.backFlag = true;
        return LiteralFactory.createLiteralWithoutLazyLiteral(s);
      }
    }
    if (next == '<') {
      // IRI!
      final StringBuilder s = new StringBuilder("<");
      boolean marked = false;
      do {
        marked = (!marked && next == '\\');
        next = this.nextCharacter();
        // Provide escaping for some characters the SPARQL parser will not like in IRIs!
        // This is an automatic escaping which may confuse some users.
        // However, it is the only way such that LazyLiterals will work properly for iris with spaces and other (forbidden) characters which occur in real-world data
        switch(next){
          case ' ':
            s.append("%20");
            break;
          case '\n':
            s.append("%0A");
            break;
          case '\r':
            s.append("%0D");
            break;
          case '<':
            s.append("%3C");
            break;
          default:
            s.append(next);
            break;
        }
      } while (next != '>' || marked);
      try {
        return LiteralFactory.createURILiteralWithoutLazyLiteral(s.toString());
      } catch (final URISyntaxException e) {
        log.error(e.toString(),e);
        return null;
      }
    }
    if (next == '_') {
      String s = "" + next;
      next = this.nextCharacter();
      if (next == ':') {
        while (!Parser.isSeparator(next)) {
          s += next;
          next = this.nextCharacter();
        }

        this.back = next;
        this.backFlag = true;
        return LiteralFactory
            .createAnonymousLiteralWithoutLazyLiteral(s);
      } else {
        return null;
      }
    }
    // qualified uri!
    String namespace = "";
    String postfix = "";
    if (next == '>') {
      // qualified uri in the format >prefix:postfix<!
      next = this.nextCharacter();
      while (next != ':') {
        namespace += next;
        next = this.nextCharacter();
      }
      next = this.nextCharacter();
      while (next != '<' && next != '\n') {
        postfix += next;
        next = this.nextCharacter();
      }
    } else {
      // is it rdf:type???
      if(next=='a'){
        namespace += next;
        next = this.nextCharacter();
        if(Parser.isSeparator(next)){
          // rdf:type recognized!
          try {
            return LiteralFactory.createURILiteralWithoutLazyLiteral("<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>");
          } catch (final URISyntaxException e) {
            log.error(e.toString(),e);
          }
        }
      }

      // qualified uri in the format prefix:postfix!
      while (next != ':') {
        namespace += next;
        next = this.nextCharacter();
      }
      next = this.nextCharacter();
      while (!Parser.isSeparator(next)) {
        postfix += next;
        next = this.nextCharacter();
      }
    }

    this.back = next;
    this.backFlag = true;

    try {
      // System.out.println(">>"+prefixe.get(namespace)+postfix+"<<");
      if (this.prefixe.get(namespace) == null) {
        log.debug("Prefix:" + namespace);
        log.debug("Postfix:" + postfix);
        log.debug("Position in line:" + this.pos);
        log.debug("Line Number:" + this.linenumber + " Line:" + this.line);
      }
      return LiteralFactory.createURILiteralWithoutLazyLiteral("<"
          + this.prefixe.get(namespace) + postfix + ">");
    } catch (final URISyntaxException e) {
      log.error(e.toString(),e);
      return null;
    }
  }

  protected final static boolean isSeparator(final char next){
    return (next == ' ' || next == '.' || next == ',' || next == ';' || next == '"' || next == '<' || next == '\n' || next == '\t');
  }

  protected char jumpOverBlanks() throws EOFException {
    char next = this.nextCharacter();
    while (next == ' ' || next == '\n' || next == '\t'){
      next = this.nextCharacter();
    }
    if(next == '#'){ // jump over comments
      while(next!='\n'){
        next = this.nextCharacter();
      }
      return this.jumpOverBlanks();
    }
    return next;
  }

  public static int getMaxTriples() {
    return Parser.maxTriples;
  }

  public static void setMaxTriples(final int maxTriples) {
    Parser.maxTriples = maxTriples;
  }
}
TOP

Related Classes of lupos.rdf.parser.Parser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.