Package org.apache.stanbol.enhancer.servicesapi

Examples of org.apache.stanbol.enhancer.servicesapi.ContentSink


   
    @Test
    public void testContentSink() throws IOException {
        String mt = "text/plain";
        Charset ISO8859_4 = Charset.forName("ISO-8859-4");
        ContentSink cs = contentItemFactory.createContentSink(mt+"; charset="+ISO8859_4.name());
        assertNotNull(cs);
        assertNotNull(cs.getBlob());
        OutputStream out = cs.getOutputStream();
        assertNotNull(cs);
        // multiple calls MUST return the same OutputStream!
        assertSame(out, cs.getOutputStream());
        //test mime type
        assertNotNull(cs.getBlob().getMimeType());
        //get MimeType MUST return the simple mime type
        assertEquals(mt, cs.getBlob().getMimeType());
        String charsetParam = cs.getBlob().getParameter().get("charset");
        assertNotNull("expected charset parameter is missing!",charsetParam);
        assertEquals(ISO8859_4.name(), charsetParam);
       
        //now write some data to the sink
        String TEST_CONTENT = "Thîs tésts wrîtîng to â ConténtSînk!";
        //note this uses the same charset as parsed as charset in the
        //constructor!
        IOUtils.write(TEST_CONTENT, cs.getOutputStream(),ISO8859_4.name());
        IOUtils.closeQuietly(cs.getOutputStream());
        //now read the data from the blob
        String content = IOUtils.toString(
            cs.getBlob().getStream(),
            charsetParam);
        assertEquals(TEST_CONTENT, content);
    }
View Full Code Here


    }

    @Test
    public void testContentSinkDefaultMimeType() throws IOException {
        String DEFAULT = "application/octet-stream";
        ContentSink cs = contentItemFactory.createContentSink(null);
        assertNotNull(cs);
        assertNotNull(cs.getBlob());
        assertNotNull(cs.getBlob().getMimeType());
        //get MimeType MUST return the simple mime type
        assertEquals(DEFAULT, cs.getBlob().getMimeType());
        assertNull(cs.getBlob().getParameter().get("charset"));
    }
View Full Code Here

            //also explicitly set the charset as contentEncoding
            String charset = mtas.mediaType.getParameters().get("charset");
            if(charset != null){
                metadata.set(Metadata.CONTENT_ENCODING, charset);
            }
            ContentSink plainTextSink;
            try {
                plainTextSink = ciFactory.createContentSink(TEXT_PLAIN +"; charset="+UTF8.name());
            } catch (IOException e) {
                IOUtils.closeQuietly(in); //close the input stream
                throw new EngineException("Error while initialising Blob for" +
                    "writing the text/plain version of the parsed content",e);
            }
            final Writer plainTextWriter = new OutputStreamWriter(plainTextSink.getOutputStream(), UTF8);
            final ContentHandler textHandler = new BodyContentHandler( //only the Body
                new PlainTextHandler(plainTextWriter, false,skipLinebreaks)); //skip ignoreable
            final ToXMLContentHandler xhtmlHandler;
            final ContentHandler mainHandler;
            ContentSink xhtmlSink = null;
            try {
                if(!plainMediaType.equals(XHTML)){ //do not parse XHTML from XHTML
                    try {
                        xhtmlSink = ciFactory.createContentSink(XHTML +"; charset="+UTF8.name());
                    } catch (IOException e) {
                        throw new EngineException("Error while initialising Blob for" +
                                "writing the application/xhtml+xml version of the parsed content",e);
                    }
                    try {
                        xhtmlHandler = new ToXMLContentHandler(xhtmlSink.getOutputStream(),UTF8.name());
                    } catch (UnsupportedEncodingException e) {
                        throw new EngineException("This system does not support the encoding "+UTF8,e);
                    }
                    mainHandler = new MultiHandler(textHandler,xhtmlHandler);
                } else {
                    mainHandler = textHandler;
                    xhtmlHandler = null;
                    xhtmlSink = null;
                }
                try {
                    AccessController.doPrivileged(new PrivilegedExceptionAction<Object>() {
                        public Object run() throws IOException, SAXException, TikaException {
                            /*
                             * We need to replace the context Classloader with the Bundle ClassLoader
                             * to ensure that Singleton instances of XML frameworks (such as node4j)
                             * do not leak into the OSGI environment.
                             *
                             * Most Java XML libs prefer to load implementations by using the
                             * {@link Thread#getContextClassLoader()}. However OSGI has no control over
                             * this {@link ClassLoader}. Because of that there can be situations where
                             * Interfaces are loaded via the Bundle Classloader and the implementations
                             * are taken from the context Classloader. What can cause
                             * {@link ClassCastException}, {@link ExceptionInInitializerError}s, ...
                             *
                             * Setting the context Classloader to the Bundle classloader helps to avoid
                             * those situations.
                             */
                            ClassLoader contextClassLoader = updateContextClassLoader();
                            try {
                                parser.parse(in, mainHandler, metadata, context);
                            }finally {
                                //reset the previous context ClassLoader
                                Thread.currentThread().setContextClassLoader(contextClassLoader);
                            }
                            return null;
                        }
                    });
                } catch (PrivilegedActionException pae) {
                    Exception e = pae.getException();
                    if(e instanceof IOException || e instanceof SAXException || e instanceof TikaException){
                        throw new EngineException("Unable to convert ContentItem "+
                                ci.getUri()+" with mimeType '"+ci.getMimeType()+"' to "+
                                "plain text!",e);
                    } else { //runtime exception
                        throw RuntimeException.class.cast(e);
                    }
                }
            } finally { //ensure that the writers are closed correctly
                IOUtils.closeQuietly(in);
                IOUtils.closeQuietly(plainTextWriter);
                if(xhtmlSink != null){
                    IOUtils.closeQuietly(xhtmlSink.getOutputStream());
                }
            }
            String random = randomUUID().toString();
            UriRef textBlobUri = new UriRef("urn:tika:text:"+random);
            ci.addPart(textBlobUri, plainTextSink.getBlob());
            if(xhtmlHandler != null){
                UriRef xhtmlBlobUri = new UriRef("urn:tika:xhtml:"+random);
                ci.addPart(xhtmlBlobUri,  xhtmlSink.getBlob());
            }
            //add the extracted metadata
            if(log.isInfoEnabled()){
                for(String name : metadata.names()){
                    log.info("{}: {}",name,Arrays.toString(metadata.getValues(name)));
View Full Code Here

        if (null == m) {
            log.debug("Unable to preocess ContentItem {} (mime type {}) with Metaxa",
                ci.getUri(),ci.getMimeType());
            return;
        }
        ContentSink plainTextSink;
        try {
            plainTextSink = ciFactory.createContentSink("text/plain");
        } catch (IOException e) {
            m.close();
            throw new EngineException("Unable to initialise Blob for storing" +
                "the plain text content",e);
        }
        HashMap<BlankNode, BNode> blankNodeMap = new HashMap<BlankNode, BNode>();
        RDF2GoUtils.urifyBlankNodes(m);
        ClosableIterator<Statement> it = m.iterator();
        BufferedWriter out = new BufferedWriter(new OutputStreamWriter(
            plainTextSink.getOutputStream(), UTF8));
        boolean textExtracted = false; //used to detect if some text was extracted
        try {
            MGraph g = new SimpleMGraph(); //first add to a temporary graph
            while (it.hasNext()) {
                Statement oneStmt = it.next();
                //we need to treat triples that provide the plain/text
                //version differently. Such Objects need to be added to
                //the plain text Blob!
                if(oneStmt.getSubject().equals(docId) &&
                        oneStmt.getPredicate().equals(NIE_PLAINTEXT_PROPERTY)){
                    String text = oneStmt.getObject().toString();
                    if(text != null && !text.isEmpty()){
                        try {
                            out.write(oneStmt.getObject().toString());
                        } catch (IOException e) {
                            throw new EngineException("Unable to write extracted" +
                                "plain text to Blob (blob impl: "
                                    + plainTextSink.getBlob().getClass()+")",e);
                        }
                        textExtracted = true;
                        if (includeText) {
                            NonLiteral subject = (NonLiteral) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
                            UriRef predicate = (UriRef) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
                            Resource object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
                            g.add(new TripleImpl(subject, predicate, object));
                        }
                    }
                } else { //add metadata to the metadata of the contentItem
                    NonLiteral subject = (NonLiteral) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
                    UriRef predicate = (UriRef) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
                    Resource object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);

                    if (null != subject && null != predicate && null != object) {
                        Triple t = new TripleImpl(subject, predicate, object);
                        g.add(t);
                        log.debug("added " + t.toString());
                    }
                }
            }
            //add the extracted triples to the metadata of the ContentItem
            ci.getLock().writeLock().lock();
            try {
                ci.getMetadata().addAll(g);
                g = null;
            } finally {
                ci.getLock().writeLock().unlock();
            }
        } finally {
            it.close();
            m.close();
            IOUtils.closeQuietly(out);
        }
        if(textExtracted){
            //add plain text to the content item
            UriRef blobUri = new UriRef("urn:metaxa:plain-text:"+randomUUID());
            ci.addPart(blobUri, plainTextSink.getBlob());
        }
    }
View Full Code Here

   
    @Test
    public void testContentSink() throws IOException {
        String mt = "text/plain";
        Charset ISO8859_4 = Charset.forName("ISO-8859-4");
        ContentSink cs = contentItemFactory.createContentSink(mt+"; charset="+ISO8859_4.name());
        assertNotNull(cs);
        assertNotNull(cs.getBlob());
        OutputStream out = cs.getOutputStream();
        assertNotNull(cs);
        // multiple calls MUST return the same OutputStream!
        assertSame(out, cs.getOutputStream());
        //test mime type
        assertNotNull(cs.getBlob().getMimeType());
        //get MimeType MUST return the simple mime type
        assertEquals(mt, cs.getBlob().getMimeType());
        String charsetParam = cs.getBlob().getParameter().get("charset");
        assertNotNull("expected charset parameter is missing!",charsetParam);
        assertEquals(ISO8859_4.name(), charsetParam);
       
        //now write some data to the sink
        String TEST_CONTENT = "Thîs tésts wrîtîng to â ConténtSînk!";
        //note this uses the same charset as parsed as charset in the
        //constructor!
        IOUtils.write(TEST_CONTENT, cs.getOutputStream(),ISO8859_4.name());
        IOUtils.closeQuietly(cs.getOutputStream());
        //now read the data from the blob
        String content = IOUtils.toString(
            cs.getBlob().getStream(),
            charsetParam);
        assertEquals(TEST_CONTENT, content);
    }
View Full Code Here

    }

    @Test
    public void testContentSinkDefaultMimeType() throws IOException {
        String DEFAULT = "application/octet-stream";
        ContentSink cs = contentItemFactory.createContentSink(null);
        assertNotNull(cs);
        assertNotNull(cs.getBlob());
        assertNotNull(cs.getBlob().getMimeType());
        //get MimeType MUST return the simple mime type
        assertEquals(DEFAULT, cs.getBlob().getMimeType());
        assertNull(cs.getBlob().getParameter().get("charset"));
    }
View Full Code Here

                in = mtas.in;
            }
            final Metadata metadata = new Metadata();
            //set the already parsed contentType
            metadata.set(Metadata.CONTENT_TYPE, mtas.mediaType.toString());
            ContentSink plainTextSink;
            try {
                plainTextSink = ciFactory.createContentSink(TEXT_PLAIN +"; charset="+UTF8.name());
            } catch (IOException e) {
                IOUtils.closeQuietly(in); //close the input stream
                throw new EngineException("Error while initialising Blob for" +
                    "writing the text/plain version of the parsed content",e);
            }
            final Writer plainTextWriter = new OutputStreamWriter(plainTextSink.getOutputStream(), UTF8);
            final ContentHandler textHandler = new BodyContentHandler( //only the Body
                new PlainTextHandler(plainTextWriter, false,skipLinebreaks)); //skip ignoreable
            final ToXMLContentHandler xhtmlHandler;
            final ContentHandler mainHandler;
            ContentSink xhtmlSink = null;
            try {
                if(!plainMediaType.equals(XHTML)){ //do not parse XHTML from XHTML
                    try {
                        xhtmlSink = ciFactory.createContentSink(XHTML +"; charset="+UTF8.name());
                    } catch (IOException e) {
                        throw new EngineException("Error while initialising Blob for" +
                                "writing the application/xhtml+xml version of the parsed content",e);
                    }
                    try {
                        xhtmlHandler = new ToXMLContentHandler(xhtmlSink.getOutputStream(),UTF8.name());
                    } catch (UnsupportedEncodingException e) {
                        throw new EngineException("This system does not support the encoding "+UTF8,e);
                    }
                    mainHandler = new MultiHandler(textHandler,xhtmlHandler);
                } else {
                    mainHandler = textHandler;
                    xhtmlHandler = null;
                    xhtmlSink = null;
                }
                try {
                    AccessController.doPrivileged(new PrivilegedExceptionAction<Object>() {
                        public Object run() throws IOException, SAXException, TikaException {
                            parser.parse(in, mainHandler, metadata, context);
                            return null;
                        }
                    });
                } catch (PrivilegedActionException pae) {
                    Exception e = pae.getException();
                    if(e instanceof IOException || e instanceof SAXException || e instanceof TikaException){
                        throw new EngineException("Unable to convert ContentItem "+
                                ci.getUri()+" with mimeType '"+ci.getMimeType()+"' to "+
                                "plain text!",e);
                    } else { //runtime exception
                        throw RuntimeException.class.cast(e);
                    }
                }
            } finally { //ensure that the writers are closed correctly
                IOUtils.closeQuietly(in);
                IOUtils.closeQuietly(plainTextWriter);
                if(xhtmlSink != null){
                    IOUtils.closeQuietly(xhtmlSink.getOutputStream());
                }
            }
            String random = randomUUID().toString();
            UriRef textBlobUri = new UriRef("urn:tika:text:"+random);
            ci.addPart(textBlobUri, plainTextSink.getBlob());
            if(xhtmlHandler != null){
                UriRef xhtmlBlobUri = new UriRef("urn:tika:xhtml:"+random);
                ci.addPart(xhtmlBlobUri,  xhtmlSink.getBlob());
            }
            //add the extracted metadata
            if(log.isInfoEnabled()){
                for(String name : metadata.names()){
                    log.info("{}: {}",name,Arrays.toString(metadata.getValues(name)));
View Full Code Here

TOP

Related Classes of org.apache.stanbol.enhancer.servicesapi.ContentSink

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.