Package org.apache.droids.solr

Source Code of org.apache.droids.solr.AdvancedSolrHandler$SolrContentHandler

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.droids.solr;

import java.io.IOException;
import java.net.URI;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.Stack;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.droids.api.ContentEntity;
import org.apache.droids.api.Handler;
import org.apache.droids.exception.DroidsException;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.common.SolrInputDocument;
import org.cyberneko.html.parsers.SAXParser;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;

/**
* A Droids Handler which allows to specify selectors to store
* documents' parts in a Solr index.
*
* A selector is an Entry made of a key which matches the solr fiel and
* of a value which correspond to a path selector.
*
* Path selectors are always absolute and supports indexes.
*
* Here are some examples:
*   - /html[0]/div[0]
*   - /html[0]/div[0]/p[0]
*   - /html[0]/div[1]/p[2]
*/
public class AdvancedSolrHandler implements Handler {

  /**
   * A solr server
   */
  private SolrServer server;
 
  /**
   * The selectors allow to save specific parts of the document in the index.
   * The HashMap's key matches the Solr field.
   * The HashMap's value is an absolute path corresponding to an element.
   */
  private Map<String, String> selectors;
 
  /**
   * A content handler
   */
  private SolrContentHandler contentHandler = new SolrContentHandler(selectors);

  /**
   * An HTML parser
   */
  private SAXParser parser;
 
  /**
   * @return the current solr server
   */
  public SolrServer getServer() {
    return server;
  }

  /**
   * @param solr a solr server
   */
  public void setServer(SolrServer solr) {
    this.server = solr;
  }

  /**
   * @return the current path selectors
   */
  public Map<String, String> getSelectors() {
    return selectors;
  }

  /**
   * @param selectors an hash map containing path selectors
   */
  public void setSelectors(HashMap<String, String> selectors) {
    contentHandler.initPatterns(selectors);
    this.selectors = selectors;
  }

  /*
   * @see org.apache.droids.api.Handler#handle(java.net.URI, org.apache.droids.api.ContentEntity)
   */
    @Override
  public void handle(URI uri, ContentEntity entity) throws IOException, DroidsException {
    SolrInputDocument doc = createSolrInputDocument(uri, entity);
    try {
      server.add(doc);
    } catch (SolrServerException e) {
      throw new DroidsException(e);
    }
  }

  /**
   * Generates a SolrInputDocument from an URI and a ContentEntity
   * which correspond to the document which need to be saved in the index
   *
   * @param uri an uri
   * @param entity an entity
   * @return
   */
  private SolrInputDocument createSolrInputDocument(URI uri, ContentEntity entity) {
    SolrInputDocument doc = new SolrInputDocument();

    doc.setField("id", uri.getPath());
    doc.setField("name", uri.toASCIIString());
    doc.setField("host", uri.getHost());
    doc.setField("mime", entity.getMimeType());
    doc.setField("content", entity.getParse().getText());
   
    if (parser == null) initParser();
   
    if (!selectors.isEmpty()) {
      contentHandler.initDocument(doc);
      try {
        parser.setContentHandler(contentHandler);
        parser.parse(new InputSource(entity.obtainContent()));
      } catch (IOException e) {
        e.printStackTrace();
      } catch (SAXException e) {
        e.printStackTrace();
      }
    }
   
    return doc;
  }

  /**
   * Initialize a Cyber Necko parser configured to return lower case element's names
   *
   * @return
   */
  private SAXParser initParser() {
    parser = new SAXParser();
    try {
      parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
      parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content", false);
      parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
      parser.setFeature("http://cyberneko.org/html/features/report-errors", false);
    } catch (SAXNotRecognizedException ex) {
      throw new IllegalStateException(ex);
    } catch (SAXNotSupportedException ex) {
      throw new IllegalStateException(ex);
    }
    return parser;
  }

  /**
   * A class that implements a SAX ContentHandler and uses patterns to record documents
   * elements in a SolrInputDocuement.
   */
  private class SolrContentHandler implements ContentHandler {

    private SolrInputDocument doc;
   
    /**
     * the patterns which match element's path
     */
    private Map<String, Pattern> patterns = new HashMap<String, Pattern>();

    /**
     * stores the values which match the patterns
     */
    private Map<String, String> valueRecorders = new HashMap<String, String>();

    /**
     * A two dimensional stack used to store the current path
     */
    private Stack<Stack<String>> path = new Stack<Stack<String>>();

    private int level = 0;

    private int lastLevel = 0;
   
    /**
     * Constructor
     *
     * @param selectors an Map which contains selectors
     */
    public SolrContentHandler(Map<String, String> selectors) {
      initPatterns(selectors);
    }
   
    /**
     * Initialize patterns
     * @param selectors
     */
    public void initPatterns(Map<String, String> selectors) {
      if (selectors != null) {
       
        // clear the current patterns
        patterns.clear();
       
        // pattern for the element and its index
        final Pattern p = Pattern.compile("^([a-zA-Z:-_\\.]+)(\\[([0-9]*)\\]){0,1}$");
 
        // for each selector
        Set<String> keys = selectors.keySet();
        for (String key : keys) {
          // creating a pattern
          String regex = "^";
          String selector = selectors.get(key);
          String[] elements = selector.split("/");
          // which match all the elements and their respective indices
          for (String element : elements) {
            Matcher m = p.matcher(element);
            if (m.find()) {
              String elementName = m.group(1);
              String elementIndex = m.group(3);
              regex += "/" + elementName;
              if (elementIndex == null) {
                regex += "\\[[0-9]*\\]";
              } else {
                regex += "\\[" + elementIndex + "\\]";
              }
            }
          }
          regex += "$";
 
          // storing the new Pattern
          Pattern pattern = Pattern.compile(regex);
          patterns.put(key, pattern);
        }
      }
    }
   
    /**
     * Initialization of the document used for indexation
     *
     * @param doc a solr document
     */
    public void initDocument(SolrInputDocument doc) {
      this.doc = doc;
    }

    /*
     * @see org.xml.sax.ContentHandler#startDocument()
     */
    @Override
    public void startDocument() throws SAXException {
      level = 0;
    }

    /*
     * @see org.xml.sax.ContentHandler#endDocument()
     */
    @Override
    public void endDocument() throws SAXException {
      level = 0;
    }

    /*
     * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {     
      // set the level properties.
      level++;

      // go down in the hierarchy of elements.
      if (level == lastLevel && path.size() > 0) {
        path.get(path.size() - 1).add(localName);
      } else if (level > lastLevel) {
        Stack<String> s = new Stack<String>();
        s.add(localName);
        path.add(s);
      }

      // if the path matches a pattern, starts recording the matching content.
      String path = getCurrentPath();
      Iterator<Entry<String, Pattern>> entries = patterns.entrySet().iterator();
      while (entries.hasNext()) {
        Entry<String, Pattern> entry = entries.next();
        String patternName = entry.getKey();
        Pattern patternValue = entry.getValue();
        Matcher matcher = patternValue.matcher(path);
        if (matcher.find()) {
          valueRecorders.put(patternName, "");
        }
      }

    }

    /*
     * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
      // check if we climb in the hierarchy.
      if (level < lastLevel && path.size() > 0) {
        path.pop();
      }
     
      // set the level properties.
      lastLevel = level;
      level--;
     
      // if the path matches a selector, stores the matching content.
      String path = getCurrentPath();
      Iterator<Entry<String, Pattern>> entries = patterns.entrySet().iterator();
      while (entries.hasNext()) {
        Entry<String, Pattern> entry = entries.next();
        String patternName = entry.getKey();
        Pattern patternValue = entry.getValue();
        Matcher matcher = patternValue.matcher(path);
        if (matcher.find()) {
          // add the matching content to the solr document.
          String value = valueRecorders.remove(patternName);
          doc.addField(patternName, value);
        }
      }
    }

    /*
     * @see org.xml.sax.ContentHandler#characters(char[], int, int)
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
      // store the content in each recorder
      Set<String> keys = valueRecorders.keySet();
      for (String key : keys) {
        String recorder = valueRecorders.get(key);
        recorder += new String(Arrays.copyOfRange(ch, start, start + length));
        valueRecorders.put(key, recorder);
      }
    }
   
    /*
     * @see org.xml.sax.ContentHandler#ignorableWhitespace(char[], int, int)
     */
    @Override
    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
      characters(ch, start, length);
    }

    /*
     * @see org.xml.sax.ContentHandler#startPrefixMapping(java.lang.String, java.lang.String)
     */
    @Override
    public void startPrefixMapping(String prefix, String uri) throws SAXException {

    }

    /*
     * @see org.xml.sax.ContentHandler#endPrefixMapping(java.lang.String)
     */
    @Override
    public void endPrefixMapping(String prefix) throws SAXException {

    }

    /*
     * @see org.xml.sax.ContentHandler#processingInstruction(java.lang.String, java.lang.String)
     */
    @Override
    public void processingInstruction(String target, String data) throws SAXException {

    }

    /*
     * @see org.xml.sax.ContentHandler#setDocumentLocator(org.xml.sax.Locator)
     */
    @Override
    public void setDocumentLocator(Locator locator) {

    }

    /*
     * @see org.xml.sax.ContentHandler#skippedEntity(java.lang.String)
     */
    @Override
    public void skippedEntity(String name) throws SAXException {

    }

    /**
     * Computes the current path by crossing the path stack.
     *
     * @return a path
     */
    private String getCurrentPath() {
      String p = "";

      // find the element at each level
      for (Stack<String> h : path) {
        String element = h.get(h.size() - 1);
        Integer index = -1;
        // find the element's index
        for (String e : h) {
          if (e.equals(element)) {
            index++;
          }
        }
        // path with the index at each level
        p += "/" + element + "[" + index + "]";
      }
      return p;
    }

  }
}
TOP

Related Classes of org.apache.droids.solr.AdvancedSolrHandler$SolrContentHandler

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.