Examples of org.apache.solr.handler.extraction.SolrContentHandler

Package org.apache.solr.handler.extraction

Examples of org.apache.solr.handler.extraction.SolrContentHandler

org.apache.solr.handler.extraction.SolrContentHandler
The class responsible for handling Tika events and translating them into {@link org.apache.solr.common.SolrInputDocument}s. This class is not thread-safe.

Users may wish to override this class to provide their own functionality. @see org.apache.solr.handler.extraction.SolrContentHandlerFactory @see org.apache.solr.handler.extraction.ExtractingRequestHandler @see org.apache.solr.handler.extraction.ExtractingDocumentLoader

      // Seed the Tika metadata with every field entry of the incoming record;
      // each value is stored in its toString() form.
      Metadata metadata = new Metadata();
      for (Entry<String, Object> entry : record.getFields().entries()) {
        metadata.add(entry.getKey(), entry.getValue().toString());
      }


      // The factory-built handler receives the parse events and is asked for the
      // resulting document once parsing has finished.
      SolrContentHandler handler = solrContentHandlerFactory.createSolrContentHandler(metadata, solrParams, schema);
      try {
        // Re-wrap the stream via TikaInputStream before handing it to the parser.
        inputStream = TikaInputStream.get(inputStream);


        // Start with the Solr handler itself as the event sink; it may be wrapped below.
        ContentHandler parsingHandler = handler;


        // String xpathExpr = "/xhtml:html/xhtml:body/xhtml:div/descendant:node()";
        // When an XPath expression is set, route events through a MatchingContentHandler
        // built from the parsed expression (presumably so that only matching parts of
        // the document reach the Solr handler -- confirm against Tika docs).
        if (xpathExpr != null) {
          Matcher matcher = PARSER.parse(xpathExpr);
          parsingHandler = new MatchingContentHandler(parsingHandler, matcher);
        }


        // Run the parse; each checked failure type is rethrown as a
        // MorphlineRuntimeException that keeps the original exception as its cause.
        try {
          parser.parse(inputStream, parsingHandler, metadata, parseContext);
        } catch (IOException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        } catch (SAXException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        } catch (TikaException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        }
      } finally {
        // Release the stream whether or not parsing succeeded.
        if (inputStream != null) {
          Closeables.closeQuietly(inputStream);
        }
      }
      
      // Obtain the Solr document from the handler, convert it to a Record and
      // pass it to the child command; the child's result is returned.
      SolrInputDocument doc = handler.newDocument();
      LOG.debug("solr doc: {}", doc);      
      Record outputRecord = toRecord(doc);
      return getChild().process(outputRecord);
    }

View Full Code Here

    // which will cause the ContentHandler to be invoked.
    // Store the value produced by getFoobarWithNonChars() under the field being tested.
    metadata.set(fieldName, getFoobarWithNonChars());
    // Build the factory under test with the default date formats.
    StripNonCharSolrContentHandlerFactory contentHandlerFactory =
      new StripNonCharSolrContentHandlerFactory(DateUtil.DEFAULT_DATE_FORMATS);
    IndexSchema schema = h.getCore().getLatestSchema();
    // Create the handler from the prepared metadata, an empty parameter map and the schema.
    // NOTE(review): `new HashMap()` is a raw type; a parameterized map would avoid the warning.
    SolrContentHandler contentHandler =
      contentHandlerFactory.createSolrContentHandler(metadata, new MapSolrParams(new HashMap()), schema);
    // No parse is run here; the document is built directly from the handler.
    SolrInputDocument doc = contentHandler.newDocument();
    // The field value in the resulting document must be exactly "foobar"
    // (presumably the helper's non-characters are expected to have been stripped).
    String foobar = doc.getFieldValue(fieldName).toString();
    assertTrue("foobar".equals(foobar));
  }

View Full Code Here

      // Seed the Tika metadata with every field entry of the incoming record;
      // each value is stored in its toString() form.
      Metadata metadata = new Metadata();
      for (Entry<String, Object> entry : record.getFields().entries()) {
        metadata.add(entry.getKey(), entry.getValue().toString());
      }


      // The factory-built handler receives the parse events and is asked for the
      // resulting document once parsing has finished.
      SolrContentHandler handler = solrContentHandlerFactory.createSolrContentHandler(metadata, solrParams, schema);
      try {
        // Re-wrap the stream via TikaInputStream before handing it to the parser.
        inputStream = TikaInputStream.get(inputStream);


        // Start with the Solr handler itself as the event sink; it may be wrapped below.
        ContentHandler parsingHandler = handler;
        // Only when trace logging is enabled: tee the parse events into an XML
        // serializer writing to debugWriter, so the event stream can be logged afterwards.
        StringWriter debugWriter = null;
        if (LOG.isTraceEnabled()) {
          debugWriter = new StringWriter();
          ContentHandler serializer = new XMLSerializer(debugWriter, new OutputFormat("XML", "UTF-8", true));
          parsingHandler = new TeeContentHandler(parsingHandler, serializer);
        }


        // String xpathExpr = "/xhtml:html/xhtml:body/xhtml:div/descendant:node()";
        // When an XPath expression is set, route events through a MatchingContentHandler
        // built from the parsed expression (presumably so that only matching parts of
        // the document reach the wrapped handler -- confirm against Tika docs).
        if (xpathExpr != null) {
          Matcher matcher = PARSER.parse(xpathExpr);
          parsingHandler = new MatchingContentHandler(parsingHandler, matcher);
        }


        // Run the parse; each checked failure type is rethrown as a
        // MorphlineRuntimeException that keeps the original exception as its cause.
        try {
          parser.parse(inputStream, parsingHandler, metadata, parseContext);
        } catch (IOException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        } catch (SAXException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        } catch (TikaException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        }
        
        // debugWriter is still null here unless trace logging was enabled above.
        LOG.trace("debug XML doc: {}", debugWriter);
      } finally {
        // Release the stream whether or not parsing succeeded.
        if (inputStream != null) {
          Closeables.closeQuietly(inputStream);
        }
      }
      
      // Obtain the Solr document from the handler, convert it to a Record and
      // pass it to the child command; the child's result is returned.
      SolrInputDocument doc = handler.newDocument();
      LOG.debug("solr doc: {}", doc);      
      Record outputRecord = toRecord(doc);
      return getChild().process(outputRecord);
    }

View Full Code Here

    // which will cause the ContentHandler to be invoked.
    // Store the value produced by getFoobarWithNonChars() under the field being tested.
    metadata.set(fieldName, getFoobarWithNonChars());
    // Build the factory under test with the default date formats.
    StripNonCharSolrContentHandlerFactory contentHandlerFactory =
      new StripNonCharSolrContentHandlerFactory(DateUtil.DEFAULT_DATE_FORMATS);
    IndexSchema schema = h.getCore().getLatestSchema();
    // Create the handler from the prepared metadata, an empty parameter map and the schema.
    // NOTE(review): `new HashMap()` is a raw type; a parameterized map would avoid the warning.
    SolrContentHandler contentHandler =
      contentHandlerFactory.createSolrContentHandler(metadata, new MapSolrParams(new HashMap()), schema);
    // No parse is run here; the document is built directly from the handler.
    SolrInputDocument doc = contentHandler.newDocument();
    // The field value in the resulting document must be exactly "foobar"
    // (presumably the helper's non-characters are expected to have been stripped).
    String foobar = doc.getFieldValue(fieldName).toString();
    assertTrue("foobar".equals(foobar));
  }

View Full Code Here

TOP

Related Classes of org.apache.solr.handler.extraction.SolrContentHandler

com.cloudera.cdk.morphline.solrcell.SolrCellBuilder$SolrCell

com.cloudera.cdk.morphline.solrcell.SolrCellMorphlineTest

org.apache.solr.common.SolrInputDocument

org.apache.solr.schema.SchemaField

org.kitesdk.morphline.solrcell.SolrCellBuilder$SolrCell

org.kitesdk.morphline.solrcell.SolrCellMorphlineTest

java.text.DateFormat

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.