Source Code of org.sindice.siren.analysis.filter.URIDecodingFilter

/**
 * Copyright 2014 National University of Ireland, Galway.
 *
 * This file is part of the SIREn project. Project and contact information:
 *
 *  https://github.com/rdelbru/SIREn
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package org.sindice.siren.analysis.filter;


import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.UnsupportedCharsetException;


import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;


/**
 * Decode the URI encoding format of special characters such as '?' or '<'.
 *
 * <p>
 *
 * Special characters (except of the SPACE that can be encoded with '+' and
 * '%20') begins with a '%' and are followed by two characters in hexadecimal
 * format. If a special character cannot be decoded, it is just skipped and the
 * decoding process just continue.
 *
 * <p>
 *
 * When a URI has special characters, two versions of the URI are produced
 * (both tokens have the same position):
 * <ul>
 * <li> the original URI </li>
 * <li> the decoded URI </li>
 * </ul>
 */
public class URIDecodingFilter extends TokenFilter {


  private final CharsetDecoder    charsetDecoder;
  private final ByteBuffer        decoded = ByteBuffer.allocate(32);


  private boolean                 modifiedURI = false;
  private CharBuffer              termBuffer;
  private int                     termLength;


  private final CharTermAttribute           termAtt;
  private final PositionIncrementAttribute  posIncrAtt;


  /**
   * Create a new URI decoding filter configured for the specified charset.
   *
   * @param input The input token stream
   * @param charsetEncoding The name of a supported character encoding.
   * @throws UnsupportedCharsetException if the character encoding is not supported or recognised.
   */
  public URIDecodingFilter(final TokenStream input, final String charsetEncoding)
  throws UnsupportedCharsetException {
    super(input);
    final Charset charset = this.lookupCharset(charsetEncoding);
    charsetDecoder = charset.newDecoder()
                            .onMalformedInput(CodingErrorAction.REPLACE)
                            .onUnmappableCharacter(CodingErrorAction.REPLACE);
    termAtt = this.addAttribute(CharTermAttribute.class);
    posIncrAtt = this.addAttribute(PositionIncrementAttribute.class);
    termBuffer = CharBuffer.allocate(256);
  }


  @Override
  public final boolean incrementToken()
  throws IOException {
    if (modifiedURI) { // Return the previously decoded URI
      modifiedURI = false;
      termAtt.setEmpty();
      termAtt.copyBuffer(termBuffer.array(), 0, termBuffer.position());
      posIncrAtt.setPositionIncrement(0);
      return true;
    }


    if (input.incrementToken()) {
      termLength = termAtt.length();
      this.updateBuffer();
      this.decode();
      return true;
    }
    return false;
  }


  /**
   * Check if the buffer is big enough
   */
  private void updateBuffer() {
    if (termBuffer.capacity() < termLength) {
      termBuffer = CharBuffer.allocate(termLength);
    }
    termBuffer.clear();
  }


  /**
   * look for the class of the given charset
   * @param csn
   * @throws UnsupportedCharsetException
   */
  private Charset lookupCharset(final String csn)
  throws UnsupportedCharsetException {
    if (Charset.isSupported(csn)) {
      return Charset.forName(csn);
    }
    throw new UnsupportedCharsetException(csn);
  }


  /**
   * Return the decimal value of an hexadecimal number. If it is not hexadecimal,
   * a negative value is returned.
   * @param c
   */
  private int hexaToInt(final char c) {
    switch (c) {
      case '0':
        return 0;
      case '1':
        return 1;
      case '2':
        return 2;
      case '3':
        return 3;
      case '4':
        return 4;
      case '5':
        return 5;
      case '6':
        return 6;
      case '7':
        return 7;
      case '8':
        return 8;
      case '9':
        return 9;
      case 'a':
        return 10;
      case 'b':
        return 11;
      case 'c':
        return 12;
      case 'd':
        return 13;
      case 'e':
        return 14;
      case 'f':
        return 15;
      case 'A':
        return 10;
      case 'B':
        return 11;
      case 'C':
        return 12;
      case 'D':
        return 13;
      case 'E':
        return 14;
      case 'F':
        return 15;
      default:
        /*
         * Return a negative value if the hexadecimal character is invalid.
         * Because it is < 0 and big enough, the character won't be decoded.
         */
        return -241;
    }
  }


  /**
   * Return the decimal value of an hexadecimal number, multiplied by 16.
   * If it is not hexadecimal, a negative value is returned.
   * @param c
   */
  private int hexaToInt2(final char c) {
    switch (c) {
      case '0':
        return 0;
      case '1':
        return 16;
      case '2':
        return 32;
      case '3':
        return 48;
      case '4':
        return 64;
      case '5':
        return 80;
      case '6':
        return 96;
      case '7':
        return 112;
      case '8':
        return 128;
      case '9':
        return 144;
      case 'a':
        return 160;
      case 'b':
        return 176;
      case 'c':
        return 192;
      case 'd':
        return 208;
      case 'e':
        return 224;
      case 'f':
        return 240;
      case 'A':
        return 160;
      case 'B':
        return 176;
      case 'C':
        return 192;
      case 'D':
        return 208;
      case 'E':
        return 224;
      case 'F':
        return 240;
      default:
        /*
         * Return a negative value if the hexadecimal character is invalid.
         * Because it is < 0 and big enough, the character won't be decoded.
         */
        return -241;
    }
  }


  /**
   * Partial decoding of URI encoded characters.
   * <br>
   * Ignore the '+' (SPACE) cases, as it does not
   * make sense to index URIs with a space. Nobody will be able to search them
   * as a space will be considered as a character delimitation.
   * <br>
   * Replace %20 by +, so that the URI can be tokenised easily (%20 causes
   * problem during tokenisation, while + does not).
   */
  private void decode() {
    char c;
    int i = 0;


    while (i < termLength) {
      c = termAtt.charAt(i);
      switch (c) {
      case '%': // Special character
        /*
         * Starting with this instance of %, process all consecutive substrings
         * of the form %xy. Each substring %xy will yield a byte. Convert all
         * consecutive  bytes obtained this way to whatever character(s) they
         * represent in the provided encoding.
         *
         * xy is a hexadecimal number.
         */
        modifiedURI = true;


        while (i + 2 < termLength && c == '%') {
          final char c1 = termAtt.charAt(i + 1);
          final char c2 = termAtt.charAt(i + 2);


          // The next two characters converted from a hex to a decimal value
          final int value = this.hexaToInt2(c1) + this.hexaToInt(c2);
          if (value == 32) { // replace the SPACE character, encoded by %20, by +
            this.decodeChars();
            termBuffer.put('+');
          } else if (value >= 0) { // Negative value are illegal. Just skip it.
            if (!decoded.hasRemaining()) { // No more place in the buffer, output what is already there.
              this.decodeChars();
            }
            decoded.put((byte) value);
          } else { // put the value back, without changing it
            this.decodeChars();
            termBuffer.put('%').put(c1).put(c2);
          }
          i += 3;
          if (i < termLength)
            c = termAtt.charAt(i);
        }
        // decode the chain of special characters
        this.decodeChars();
        // incomplete byte encoding (e.g., %x). Skip it.
        if (i < termLength && c == '%') {
          termBuffer.put('%');
          i++;
        }
        break;
      default:
        termBuffer.put(c);
        i++;
        break;
      }
    }
  }


  private void decodeChars() {
    final int limit = decoded.position();
    decoded.position(0);
    decoded.limit(limit);
    charsetDecoder.decode(decoded, termBuffer, true);
    decoded.clear();
  }


}
Source Code of org.sindice.siren.analysis.filter.URIDecodingFilter

Related Classes of org.sindice.siren.analysis.filter.URIDecodingFilter