Package com.findwise.utils.tika

Examples of com.findwise.utils.tika.InputStreamParser


    public void process(LocalDocument doc) throws Exception {
        Map<String, Object> urls = FieldHelper.getFieldMatchingPattern(doc.getContentMap(),
                urlFieldPattern);
        UriParser uriParser = new UriParser();
        DocumentParserHelper documentParserHelper = new DocumentParserHelper(addMetaData, addLanguage);
        InputStreamParser inputStreamParser = new InputStreamParser(parser);
        for (String field : urls.keySet()) {
            Iterator<URL> it = uriParser.getUrlsFromObject(urls.get(field)).iterator();
            for (int i = 1; it.hasNext(); i++) {
                String num = (i > 1) ? "" + i : "";
                URL url = it.next();
                URLConnection connection = createConnection(url);
                final InputStream inputStream = connection.getInputStream();
                try {
                    String prefix = field + num + "_";
                    ParsedData parsedData = inputStreamParser.parse(inputStream);
                    documentParserHelper.addParsedDataToDocument(parsedData, doc, prefix);
                } finally {
                    inputStream.close();
                }
            }
View Full Code Here


        return (size <= maxSizeInBytes);
    }

    private void enrichDocumentWithFileContents(LocalDocument doc, InputStream stream) throws IOException,
            SAXException, TikaException {
        InputStreamParser inputStreamParser = new InputStreamParser();
        ParsedData parsedData = inputStreamParser.parse(stream);

        addTextToDocument(doc, parsedData.getContent());
        addMetadataToDocument(doc, parsedData.getMetadata());
    }
View Full Code Here

        List<String> files = doc.getFileNames();
        DocumentParserHelper documentParserHelper = new DocumentParserHelper(addMetaData, addLanguage);
        for (String fileName : files) {
            DocumentFile<Local> df = doc.getFile(fileName);
            String prefix = fileName.replace('.', '_') + "_";
            InputStreamParser inputStreamParser = new InputStreamParser(parser);
            ParsedData parsedData = inputStreamParser.parse(df.getStream());
            documentParserHelper.addParsedDataToDocument(parsedData, doc, prefix);
        }
    }
View Full Code Here

TOP

Related Classes of com.findwise.utils.tika.InputStreamParser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.