Package org.wso2.carbon.registry.core.jdbc.indexing

Source Code of org.wso2.carbon.registry.core.jdbc.indexing.Indexer

/*
* Copyright (c) 2008, WSO2 Inc. (http://www.wso2.org) All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.wso2.carbon.registry.core.jdbc.indexing;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.jdbc.JdbcDirectory;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
import org.wso2.carbon.registry.core.Resource;
import org.wso2.carbon.registry.core.config.RegistryContext;
import org.wso2.carbon.registry.core.dao.ResourceDAO;
import org.wso2.carbon.registry.core.exceptions.RegistryException;
import org.wso2.carbon.registry.core.jdbc.handlers.RequestContext;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.URL;

public class Indexer {

    private static final Log log = LogFactory.getLog(Indexer.class);

    private StringBuffer sb = new StringBuffer();
    private String id;
    private String contentString;
    private String url;
    private InputStream is;
    private BufferedReader br;

    private URL resourceURL;

    public void updateIndex(RequestContext requestContext) throws RegistryException {
        String line;
        Resource resource = requestContext.getResource();

        try {
            getId(requestContext);
            Object contentObj = resource.getContent();
            byte[] content;
            if (contentObj instanceof String) {
                content = ((String) contentObj).getBytes();
            } else {
                content = (byte[]) contentObj;
            }
            is = resource.getContentStream();
            if (content != null) {
                contentString = new String(content);
            }
            url = requestContext.getSourceURL();
            if (url != null) {
                validateForLocalUrl(url);
                resourceURL = new URL(url);
                is = resourceURL.openStream();

                br = new BufferedReader(new InputStreamReader(is));
                while ((line = br.readLine()) != null) {
                    sb.append(line).append("\n");
                }
                contentString = sb.toString();
                is.close();
            }

            Document document = new Document();
            document.add(new Field("id", id, Field.Store.YES, Field.Index.TOKENIZED));
            document.add(
                    new Field("content", contentString, Field.Store.NO, Field.Index.TOKENIZED));
            IndexWriter writer = new IndexWriter(RegistryContext.getBaseInstance().getJdbcDir(),
                    new StandardAnalyzer());
            writer.addDocument(document);
            writer.optimize();
            writer.close();
        } catch (CorruptIndexException e) {
        } catch (IOException e) {
        }
    }

    public void indexXML(RequestContext requestContext) throws RegistryException {
        String line;
        final StringBuffer contentOnly = new StringBuffer();
        Resource resource = requestContext.getResource();

        try {
            getId(requestContext);
            is = resource.getContentStream();
            url = requestContext.getSourceURL();
            if (url != null) {
                validateForLocalUrl(url);
                resourceURL = new URL(url);
                is = resourceURL.openStream();
            }

            DefaultHandler handler = new DefaultHandler() {
                public void characters(char ch[], int start, int length) throws SAXException {
                    contentOnly.append(new String(ch, start, length));
                }
            };
            SAXParserFactory factory = SAXParserFactory.newInstance();
            SAXParser saxParser = factory.newSAXParser();
            saxParser.parse(is, handler);

            if (url != null) {
                is = resourceURL.openStream();
            } else {
                is = resource.getContentStream();
            }
            br = new BufferedReader(new InputStreamReader(is));
            while ((line = br.readLine()) != null) {
                sb.append(line + "\n");
            }
            is.close();

            Document document = new Document();
            document.add(new Field("id", id, Field.Store.YES, Field.Index.TOKENIZED));
            document.add(
                    new Field("content", sb.toString(), Field.Store.NO, Field.Index.TOKENIZED));
            document.add(new Field("contentOnly", contentOnly.toString(), Field.Store.NO,
                    Field.Index.TOKENIZED));

            IndexWriter writer = new IndexWriter(RegistryContext.getBaseInstance().getJdbcDir(),
                    new StandardAnalyzer());
            writer.addDocument(document);
            writer.optimize();
            writer.close();
        } catch (IOException e) {
            String msg = "Failed to write to the index";
            log.error(msg);
            throw new RegistryException(msg);
        } catch (SAXException e) {
            String msg = "Failed to parse XML";
            log.error(msg);
            throw new RegistryException(msg);
        } catch (ParserConfigurationException e) {
            String msg = "Failed to parse XML";
            log.error(msg);
            throw new RegistryException(msg);
        }

    }

    public void indexPDF(RequestContext requestContext) throws RegistryException {
        Resource resource = requestContext.getResource();

        try {
            getId(requestContext);
            is = resource.getContentStream();
            url = requestContext.getSourceURL();
            if (url != null) {
                validateForLocalUrl(url);
                resourceURL = new URL(url);
                is = resourceURL.openStream();
            }

            PDFParser parser = new PDFParser(is);
            parser.parse();
            COSDocument cosDoc = parser.getDocument();

            PDFTextStripper stripper = new PDFTextStripper();
            String docText = stripper.getText(new PDDocument(cosDoc));
            cosDoc.close();
            Document document = new Document();
            document.add(new Field("id", id, Field.Store.YES, Field.Index.TOKENIZED));
            document.add(
                    new Field("content", docText, Field.Store.NO, Field.Index.TOKENIZED));
            IndexWriter writer = new IndexWriter(RegistryContext.getBaseInstance().getJdbcDir(),
                    new StandardAnalyzer());
            writer.addDocument(document);
            writer.optimize();
            writer.close();
        } catch (IOException e) {
            String msg = "Failed to write to the index";
            log.error(msg);
            throw new RegistryException(msg);
        }
    }

    public void indexMSWord(RequestContext requestContext) throws RegistryException {
        Resource resource = requestContext.getResource();
        try {
            getId(requestContext);
            is = resource.getContentStream();
            url = requestContext.getSourceURL();
            if (url != null) {
                validateForLocalUrl(url);
                resourceURL = new URL(url);
                is = resourceURL.openStream();
            }

            POIFSFileSystem fs = new POIFSFileSystem(is);
            WordExtractor extractor = new WordExtractor(fs);
            String wordText = extractor.getText();

            Document document = new Document();
            document.add(new Field("id", id, Field.Store.YES, Field.Index.TOKENIZED));
            document.add(
                    new Field("content", wordText, Field.Store.NO, Field.Index.TOKENIZED));
            IndexWriter writer = new IndexWriter(RegistryContext.getBaseInstance().getJdbcDir(),
                    new StandardAnalyzer());
            writer.addDocument(document);
            writer.optimize();
            writer.close();
        } catch (IOException e) {
            String msg = "Failed to write to the index";
            log.error(msg);
            throw new RegistryException(msg);
        }
    }

    public void indexMSExcel(RequestContext requestContext) throws RegistryException {
        Resource resource = requestContext.getResource();
        try {
            getId(requestContext);
            is = resource.getContentStream();
            url = requestContext.getSourceURL();
            if (url != null) {
                validateForLocalUrl(url);
                resourceURL = new URL(url);
                is = resourceURL.openStream();
            }

            POIFSFileSystem fs = new POIFSFileSystem(is);
            ExcelExtractor extractor = new ExcelExtractor(fs);
            String excelText = extractor.getText();

            Document document = new Document();
            document.add(new Field("id", id, Field.Store.YES, Field.Index.TOKENIZED));
            document.add(
                    new Field("content", excelText, Field.Store.NO, Field.Index.TOKENIZED));
            IndexWriter writer = new IndexWriter(RegistryContext.getBaseInstance().getJdbcDir(),
                    new StandardAnalyzer());
            writer.addDocument(document);
            writer.optimize();
            writer.close();
        } catch (IOException e) {
            String msg = "Failed to write to the index";
            log.error(msg);
            throw new RegistryException(msg);
        }
    }

    public void indexMSPowerpoint(RequestContext requestContext) throws RegistryException {
        Resource resource = requestContext.getResource();
        try {
            getId(requestContext);
            is = resource.getContentStream();
            url = requestContext.getSourceURL();
            if (url != null) {
                validateForLocalUrl(url);
                resourceURL = new URL(url);
                is = resourceURL.openStream();
            }

            POIFSFileSystem fs = new POIFSFileSystem(is);
            PowerPointExtractor extractor = new PowerPointExtractor(fs);
            String ppText = extractor.getText();

            Document document = new Document();
            document.add(new Field("id", id, Field.Store.YES, Field.Index.TOKENIZED));
            document.add(
                    new Field("content", ppText, Field.Store.NO, Field.Index.TOKENIZED));
            IndexWriter writer = new IndexWriter(RegistryContext.getBaseInstance().getJdbcDir(),
                    new StandardAnalyzer());
            writer.addDocument(document);
            writer.optimize();
            writer.close();
        } catch (IOException e) {
            String msg = "Failed to write to the index";
            log.error(msg);
            throw new RegistryException(msg);
        }
    }

    public void deleteFromIndex(RequestContext requestContext) throws RegistryException {
        JdbcDirectory jdbcDir = RegistryContext.getBaseInstance().getJdbcDir();
        Resource resource = requestContext.getResource();
        id = resource.getId();

        try {
            IndexReader reader = IndexReader.open(jdbcDir);
            Term term = new Term("id", id);
            if (reader.docFreq(term) > 0) {
                reader.deleteDocuments(term);
            }
            reader.close();
        } catch (IOException e) {
            String msg = "Failed to delete from the index";
            log.error(msg);
            throw new RegistryException(msg);
        }
    }

    private void validateForLocalUrl(String url) throws RegistryException {
        if (url != null && url.toLowerCase().startsWith("file:")) {
            String msg = "The source URL must not be file in the server's local file system";
            throw new RegistryException(msg);
        }
    }

    private void getId(RequestContext requestContext) throws RegistryException {
        throw new UnsupportedOperationException();
//TODO:*
//        Resource resource = requestContext.getResource();
//        String path = requestContext.getResourcePath().getPath();
//        JdbcDirectory jdbcDir = RegistryContext.getBaseInstance().getJdbcDir();
//
//        try {
//            if (resourceDAO.resourceExists(path)) {
//                id = resourceDAO.getResourceID(path, RegistryContext.getBaseInstance().getDataSource().
//                        getConnection());
//                if (IndexReader.indexExists(jdbcDir)) {
//                    IndexReader reader = IndexReader.open(jdbcDir);
//                    Term term = new Term("id", id);
//                    if (reader.docFreq(term) > 0) {
//                        reader.deleteDocuments(term);
//                    }
//                    reader.close();
//                }
//            } else {
//                id = resource.getId();
//            }
//        } catch (IOException e) {
//            String msg = "Failed to write to the index";
//            log.error(msg);
//            throw new RegistryException(msg);
//        } catch (SQLException e) {
//            String msg = "Failed to connect with the database";
//            log.error(msg);
//            throw new RegistryException(msg);
//        }
    }
}
TOP

Related Classes of org.wso2.carbon.registry.core.jdbc.indexing.Indexer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.