Package org.apache.solr.handler.extraction

Examples of org.apache.solr.handler.extraction.SolrContentHandler


      Metadata metadata = new Metadata();
      for (Entry<String, Object> entry : record.getFields().entries()) {
        metadata.add(entry.getKey(), entry.getValue().toString());
      }

      SolrContentHandler handler = solrContentHandlerFactory.createSolrContentHandler(metadata, solrParams, schema);
      try {
        inputStream = TikaInputStream.get(inputStream);

        ContentHandler parsingHandler = handler;

        // String xpathExpr = "/xhtml:html/xhtml:body/xhtml:div/descendant:node()";
        if (xpathExpr != null) {
          Matcher matcher = PARSER.parse(xpathExpr);
          parsingHandler = new MatchingContentHandler(parsingHandler, matcher);
        }

        try {
          parser.parse(inputStream, parsingHandler, metadata, parseContext);
        } catch (IOException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        } catch (SAXException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        } catch (TikaException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        }
      } finally {
        if (inputStream != null) {
          Closeables.closeQuietly(inputStream);
        }
      }
     
      SolrInputDocument doc = handler.newDocument();
      LOG.debug("solr doc: {}", doc);     
      Record outputRecord = toRecord(doc);
      return getChild().process(outputRecord);
    }
View Full Code Here


    // which will cause the ContentHandler to be invoked.
    metadata.set(fieldName, getFoobarWithNonChars());
    StripNonCharSolrContentHandlerFactory contentHandlerFactory =
      new StripNonCharSolrContentHandlerFactory(DateUtil.DEFAULT_DATE_FORMATS);
    IndexSchema schema = h.getCore().getLatestSchema();
    SolrContentHandler contentHandler =
      contentHandlerFactory.createSolrContentHandler(metadata, new MapSolrParams(new HashMap()), schema);
    SolrInputDocument doc = contentHandler.newDocument();
    String foobar = doc.getFieldValue(fieldName).toString();
    assertTrue("foobar".equals(foobar));
  }
View Full Code Here

      Metadata metadata = new Metadata();
      for (Entry<String, Object> entry : record.getFields().entries()) {
        metadata.add(entry.getKey(), entry.getValue().toString());
      }

      SolrContentHandler handler = solrContentHandlerFactory.createSolrContentHandler(metadata, solrParams, schema);
      try {
        inputStream = TikaInputStream.get(inputStream);

        ContentHandler parsingHandler = handler;
        StringWriter debugWriter = null;
        if (LOG.isTraceEnabled()) {
          debugWriter = new StringWriter();
          ContentHandler serializer = new XMLSerializer(debugWriter, new OutputFormat("XML", "UTF-8", true));
          parsingHandler = new TeeContentHandler(parsingHandler, serializer);
        }

        // String xpathExpr = "/xhtml:html/xhtml:body/xhtml:div/descendant:node()";
        if (xpathExpr != null) {
          Matcher matcher = PARSER.parse(xpathExpr);
          parsingHandler = new MatchingContentHandler(parsingHandler, matcher);
        }

        try {
          parser.parse(inputStream, parsingHandler, metadata, parseContext);
        } catch (IOException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        } catch (SAXException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        } catch (TikaException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        }
       
        LOG.trace("debug XML doc: {}", debugWriter);
      } finally {
        if (inputStream != null) {
          Closeables.closeQuietly(inputStream);
        }
      }
     
      SolrInputDocument doc = handler.newDocument();
      LOG.debug("solr doc: {}", doc);     
      Record outputRecord = toRecord(doc);
      return getChild().process(outputRecord);
    }
View Full Code Here

    // which will cause the ContentHandler to be invoked.
    metadata.set(fieldName, getFoobarWithNonChars());
    StripNonCharSolrContentHandlerFactory contentHandlerFactory =
      new StripNonCharSolrContentHandlerFactory(DateUtil.DEFAULT_DATE_FORMATS);
    IndexSchema schema = h.getCore().getLatestSchema();
    SolrContentHandler contentHandler =
      contentHandlerFactory.createSolrContentHandler(metadata, new MapSolrParams(new HashMap()), schema);
    SolrInputDocument doc = contentHandler.newDocument();
    String foobar = doc.getFieldValue(fieldName).toString();
    assertTrue("foobar".equals(foobar));
  }
View Full Code Here

TOP

Related Classes of org.apache.solr.handler.extraction.SolrContentHandler

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.