Package org.apache.stanbol.entityhub.indexing.core.impl

Source Code of org.apache.stanbol.entityhub.indexing.core.impl.IndexerImpl

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.entityhub.indexing.core.impl;
import static org.apache.stanbol.entityhub.indexing.core.impl.IndexerConstants.INDEXING_COMPLETED_QUEUE_ITEM;
import static org.apache.stanbol.entityhub.indexing.core.impl.IndexerConstants.SCORE_FIELD;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import java.util.zip.ZipOutputStream;

import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.stanbol.entityhub.indexing.core.EntityDataIterable;
import org.apache.stanbol.entityhub.indexing.core.EntityDataIterator;
import org.apache.stanbol.entityhub.indexing.core.EntityDataProvider;
import org.apache.stanbol.entityhub.indexing.core.EntityIterator;
import org.apache.stanbol.entityhub.indexing.core.EntityProcessor;
import org.apache.stanbol.entityhub.indexing.core.EntityScoreProvider;
import org.apache.stanbol.entityhub.indexing.core.Indexer;
import org.apache.stanbol.entityhub.indexing.core.IndexingComponent;
import org.apache.stanbol.entityhub.indexing.core.IndexingDestination;
import org.apache.stanbol.entityhub.indexing.core.destination.OsgiConfigurationUtil;
import org.apache.stanbol.entityhub.indexing.core.event.IndexingEvent;
import org.apache.stanbol.entityhub.indexing.core.event.IndexingListener;
import org.apache.stanbol.entityhub.indexing.core.impl.IndexingDaemon.IndexingDaemonEventObject;
import org.apache.stanbol.entityhub.indexing.core.impl.IndexingDaemon.IndexingDaemonListener;
import org.apache.stanbol.entityhub.indexing.core.impl.IndexingSourceInitialiser.IndexingSourceEventObject;
import org.apache.stanbol.entityhub.indexing.core.impl.IndexingSourceInitialiser.IndexingSourceInitialiserListener;
import org.apache.stanbol.entityhub.indexing.core.normaliser.ScoreNormaliser;
import org.apache.stanbol.entityhub.indexing.core.processor.EmptyProcessor;
import org.apache.stanbol.entityhub.indexing.core.source.LineBasedEntityIterator;
import org.apache.stanbol.entityhub.indexing.core.source.YardEntityDataProvider;
import org.apache.stanbol.entityhub.servicesapi.model.Representation;
import org.apache.stanbol.entityhub.servicesapi.yard.Yard;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Implementation of the {@link Indexer} interface. <p>
* In principle there are two ways to index entities: <ol>
* <li> Iterate over the entityIds/scores by using an {@link EntityIterator}
* and look up the data by using an {@link EntityDataProvider}.
* <li> Iterate over the data by using an {@link EntityDataIterator} (provided
* by an {@link EntityDataIterable}) and look up/calculate the scores by using
* an {@link EntityScoreProvider}.
* </ol>
* This implementation provides a constructor for each of
* the two variants.<p>
* After the Entities are loaded from the source they are processed by using the
* configured {@link EntityProcessor}. Finally the processed entities are
* persisted in a {@link Yard}.
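* <p>
* A minimal usage sketch (illustrative only; the concrete component
* instances are assumed to be provided by the indexing configuration):
* <pre>{@code
*   Indexer indexer = new IndexerImpl(
*       dataIterable, scoreProvider,   // data-mode sources
*       normaliser, destination,       // score normalisation + target
*       processors,                    // EntityProcessor chain
*       new File("indexed-entity-ids.txt.gz"), // ids of indexed entities
*       postProcessors);               // may be null
*   indexer.index(); // initialise, index, post-process and finalise
* }</pre>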
* @author Rupert Westenthaler
*
*/
public class IndexerImpl implements Indexer {
    //protected to allow internal classes direct access!
    protected static final Logger log = LoggerFactory.getLogger(IndexerImpl.class);

    /**
     * Holds the indexing listeners
     */
    private final Set<IndexingListener> listeners;

    private int chunkSize;
    public static final int MIN_QUEUE_SIZE = 500;
   
    private boolean indexAllEntitiesState = false;
   
    //entityMode
    private EntityIterator entityIterator;
    private EntityDataProvider dataProvider;
    //dataMode
    private EntityDataIterable dataIterable;
    private EntityScoreProvider scoreProvider;
    private final Collection<IndexingComponent> indexingComponents;
   
    private final IndexingDestination indexingDestination;
    private final List<EntityProcessor> entityProcessors;
    private final ScoreNormaliser scoreNormaliser;
   
    private State state = State.UNINITIALISED;
    private final Object stateSync = new Object();
    /**
     * The file to store the IDs of indexed entities. Might be <code>null</code>
     * if {@link #entityPostProcessors} is <code>null</code>.
     */
    private File indexedEntityIdFile;
    /**
     * The {@link EntityProcessor}s used for the post processing phase or
     * <code>null</code> if none.<p>
     * If NOT <code>null</code>, {@link #indexedEntityIdFile} is also
     * guaranteed to be NOT <code>null</code>!
     */
    private List<EntityProcessor> entityPostProcessors;

    private OutputStream indexedEntityIdOutputStream;
   
    public IndexerImpl(EntityIterator entityIterator,
                       EntityDataProvider dataProvider,
                       ScoreNormaliser normaliser,
                       IndexingDestination indexingDestination,
                       List<EntityProcessor> entityProcessors,
                       File indexedEntityIdFile,
                       List<EntityProcessor> entityPostProcessors){
        this(normaliser,indexingDestination,entityProcessors,indexedEntityIdFile,entityPostProcessors);
        //set entityMode interfaces
        if(entityIterator == null){
            throw new IllegalArgumentException("The EntityIterator MUST NOT be NULL!");
        }
        this.entityIterator = entityIterator;
        this.dataProvider = dataProvider;
        //add the passed indexingSources to the list
        this.indexingComponents.add(entityIterator);
        this.indexingComponents.add(dataProvider);
    }
    public IndexerImpl(EntityDataIterable dataIterable,
                       EntityScoreProvider scoreProvider,
                       ScoreNormaliser normaliser,
                       IndexingDestination indexingDestination,
                       List<EntityProcessor> entityProcessors,
                       File indexedEntityIdFile,
                       List<EntityProcessor> entityPostProcessors){
        this(normaliser,indexingDestination,entityProcessors,indexedEntityIdFile,entityPostProcessors);
        //deactivate entityMode interfaces
        this.entityIterator = null;
        if(scoreProvider == null){
            throw new IllegalArgumentException("The EntityScoreProvider MUST NOT be NULL!");
        }
        this.scoreProvider = scoreProvider;
        this.dataIterable = dataIterable;
        //add the passed indexingSources to the list
        this.indexingComponents.add(scoreProvider);
        this.indexingComponents.add(dataIterable);
    }
   
    protected IndexerImpl(ScoreNormaliser normaliser,
                          IndexingDestination indexingDestination,
                          List<EntityProcessor> entityProcessors,
                          File indexedEntityIdFile,
                          List<EntityProcessor> entityPostProcessors){
        if(indexingDestination == null){
            throw new IllegalArgumentException("The Yard MUST NOT be NULL!");
        }
        this.indexingDestination = indexingDestination;
        if(entityProcessors == null){
            this.entityProcessors = Collections.singletonList((EntityProcessor)new EmptyProcessor());
        } else {
            this.entityProcessors = entityProcessors;
        }
        setChunkSize(DEFAULT_CHUNK_SIZE); //init the chunk size and the cache
        this.scoreNormaliser = normaliser;
        indexingComponents = new ArrayList<IndexingComponent>();
        indexingComponents.add(indexingDestination);
        indexingComponents.addAll(entityProcessors);
        listeners = new HashSet<IndexingListener>();
        this.indexedEntityIdFile = indexedEntityIdFile;
        if(entityPostProcessors != null){
            if(entityPostProcessors.isEmpty()){
                this.entityPostProcessors = null;
            } else {
                this.entityPostProcessors = entityPostProcessors;
                if(indexedEntityIdFile == null){
                    throw new IllegalArgumentException("The file used to store" +
                        "the IDs of indexed Entities MUST NOT be NULL if" +
                        "EntityPostProcessors are defined!");
                }
            }
        } else {
            this.entityPostProcessors = null;
        }
    }
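    /**
     * Registers an {@link IndexingListener} that is notified about state
     * changes of this Indexer. A minimal sketch (the anonymous listener
     * shown here is illustrative only):
     * <pre>{@code
     *   indexer.addIndexListener(new IndexingListener() {
     *       public void stateChanged(IndexingEvent event) {
     *           log.info("Indexer state changed to {}", indexer.getState());
     *       }
     *       public void indexingCompleted(IndexingEvent event) {
     *           log.info("Indexing completed!");
     *       }
     *   });
     * }</pre>
     * @param listener the listener to add
     * @return <code>true</code> if the listener was added
     */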
    public boolean addIndexListener(IndexingListener listener){
        if(listener != null){
            synchronized (listeners) {
                return listeners.add(listener);
            }
        } else {
            return false;
        }
    }
    public boolean removeIndexListener(IndexingListener listener){
        if(listener != null){
            synchronized (listeners) {
                return listeners.remove(listener);
            }
        } else {
            return false;
        }
    }
    protected void fireStateChanged(){
        IndexingEvent event = new IndexingEvent(this);
        Collection<IndexingListener> copy = new ArrayList<IndexingListener>(listeners.size());
        synchronized (listeners) {
            copy.addAll(listeners);
        }       
        for(IndexingListener listener : copy){
            listener.stateChanged(event);
            //if the state is finished also send the completed event
            if(getState() == State.FINISHED){
                listener.indexingCompleted(event);
            }
        }
    }
   
    /* (non-Javadoc)
     * @see org.apache.stanbol.entityhub.indexing.core.IndexerInterface#setChunkSize(int)
     */
    public void setChunkSize(int chunkSize) throws IllegalStateException {
        if(getState().ordinal() >= State.INDEXING.ordinal()){
            throw new IllegalStateException("Setting the chunkSize is only allowed before starting the indexing process!");
        }
        if(chunkSize <= 0){
            chunkSize = DEFAULT_CHUNK_SIZE;
        }
        this.chunkSize = chunkSize;
    }
    /* (non-Javadoc)
     * @see org.apache.stanbol.entityhub.indexing.core.IndexerInterface#getChunkSize()
     */
    public int getChunkSize() {
        return chunkSize;
    }
    /* (non-Javadoc)
     * @see org.apache.stanbol.entityhub.indexing.core.IndexerInterface#getYard()
     */
    public Yard getYard() {
        return indexingDestination.getYard();
    }
    @Override
    public void initialiseIndexing() {
        synchronized (stateSync) { //ensure that two threads do not start the
            //initialisation at the same time ...
            if(getState() != State.UNINITIALISED){
                return;
            }
            setState(State.INITIALISING);
            log.info("Initialisation started ...");
        }
        //add all IndexingSources that need to be initialised to a set
        final Collection<IndexingComponent> toInitialise = new HashSet<IndexingComponent>();
        //we need a simple listener that removes the IndexingSources from the
        //above set
        final IndexingSourceInitialiserListener listener = new IndexingSourceInitialiserListener() {
            @Override
            public void indexingSourceInitialised(IndexingSourceEventObject eventObject) {
                //remove the IndexingSource from the toInitialise set
                synchronized (toInitialise) {
                    toInitialise.remove(eventObject.getIndexingSource());
                    if(toInitialise.isEmpty()){ //if no more left to initialise
                        //notify others about it
                        toInitialise.notifyAll();
                    }
                }
                //finally remove this listener
                eventObject.getSource().removeIndexingSourceInitialiserListener(this);
            }
        };
        //now create the IndexingSourceInitialisers that initialise the
        //Indexing Sources, each in its own Thread
        for(IndexingComponent source : indexingComponents){
            if(source.needsInitialisation()){ //if it needs to be initialised
                toInitialise.add(source); // add it to the list
                //create an initialiser
                IndexingSourceInitialiser initialiser = new IndexingSourceInitialiser(source);
                //add the listener
                initialiser.addIndexingSourceInitialiserListener(listener);
                //create and init the Thread
                Thread thread = new Thread(initialiser);
                thread.setDaemon(true);
                thread.start();
            } //else no initialisation is needed
        }
        //now wait until all IndexingSources are initialised!
        //NOTE: the empty-check and the wait() MUST be within the same
        //synchronized block, otherwise the final notifyAll() could be missed
        synchronized (toInitialise) {
            while(!toInitialise.isEmpty()){
                try {
                    toInitialise.wait();
                } catch (InterruptedException e) {
                    //interrupted while waiting ... re-check the condition
                }
            }
        }
        //initialise the stream used to write the ids of indexed entities
        try {
            indexedEntityIdOutputStream = getEntityIdFileOutputStream();
        } catch (IOException e) {
            if(entityPostProcessors != null){
                throw new IllegalStateException("Unable to open stream for writing" +
                        "the IDs of indexed Entities as required for post-" +
                        "processing!",e);
            } else {
                log.warn("Unable to open stream for writing the Ids of indexed " +
                        "Entities -> indexes entity Ids will not be available!",e);
            }
        }

        log.info("Initialisation completed");
        setState(State.INITIALISED);
    }
    /* (non-Javadoc)
     * @see org.apache.stanbol.entityhub.indexing.core.IndexerInterface#index()
     */
    public void index(){
        Set<State> supportedStates = EnumSet.of(
            State.UNINITIALISED,State.INITIALISED,State.INDEXED,State.FINISHED);
        //this is only used to inform about wrong usage. It does not prevent
        //index() from being called concurrently by different threads. That
        //check is done within the initialise, index and finalise methods!
        State state = getState();
        if(!supportedStates.contains(state)){
            throw new IllegalStateException(String.format(
                "Calling this Method is not supported while in State %s! Supported States are %s",
                state,supportedStates));
        }
        log.info("start initialisation ...");
        initialiseIndexing();
        log.info("  ... initialisation completed");
        //if now the state is an unsupported one it indicates that
        //initialiseIndexing() was called by another thread before this one!
        state = getState();
        if(!supportedStates.contains(state)){
            throw new IllegalStateException(String.format(
                "Calling this Method is not supported while in State %s! Supported States are %s",
                state,supportedStates));
        }
        log.info("start indexing ...");
        indexEntities();
        log.info("  ... indexing completed");
        log.info("start post-processing ...");
        postProcessEntities();
        log.info("  ... post-processing finished ...");
        log.info("start finalisation....");
        finaliseIndexing();
        log.info("  ...finalisation completed");
    }
    @Override
    public void postProcessEntities() {
        synchronized (stateSync) { //ensure that two threads do not start the
            //post-processing at the same time ...
            State state = getState();
            if(state.ordinal() < State.INDEXED.ordinal()){
                throw new IllegalStateException("The Indexer MUST BE already "+State.INDEXED+" when calling this Method!");
            }
            if(state != State.INDEXED){ //if state > INDEXED
                return; // ignore this call
            }
            setState(State.POSTPROCESSING);
            log.info("Post-processing started ...");
        }
        if(entityPostProcessors == null || entityPostProcessors.isEmpty()){
            setState(State.POSTPROCESSED);
            return; //nothing to do
        }
        //init the post processing components
        //use an EntityDataProvider based on the indexed data
        EntityDataProvider dataProvider = new YardEntityDataProvider(indexingDestination.getYard());
        //use a LineBasedEntityIterator to iterate over the indexed entity ids
        EntityIterator entityIterator;
        try {
            entityIterator = new LineBasedEntityIterator(getEntityIdFileInputStream(),"UTF-8",null);
        } catch (IOException e) {
            throw new IllegalStateException("Unable to open file containing the " +
                "IDs of the indexed Entities!",e);
        }
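        //configure the iterator: the entity id is at the first position of
        //each line; the score position is set to Integer.MAX_VALUE because
        //the id file contains no score column, so no score will be parsed
        //(see the TODO below about the resulting NULL scores)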
        Map<String,Object> config = new HashMap<String,Object>();
        config.put(LineBasedEntityIterator.PARAM_ID_POS, 1);
        config.put(LineBasedEntityIterator.PARAM_SCORE_POS, Integer.MAX_VALUE);
        entityIterator.setConfiguration(config);
        //init the post-processors (this time not in their own threads, as
        //this does not really make sense for processors)
        for(EntityProcessor processor : entityPostProcessors){
            if(processor.needsInitialisation()){
                processor.initialise();
            }
        }
        //NOTE the destination does not need to be initialised -> it will be
        //the same as for indexing!

        //initialisation complete ... now set up the post-processing
        //init the queues
        int queueSize = Math.max(MIN_QUEUE_SIZE, chunkSize*2);
        BlockingQueue<QueueItem<Representation>> indexedEntityQueue =
                new ArrayBlockingQueue<QueueItem<Representation>>(queueSize);
        BlockingQueue<QueueItem<Representation>> processedEntityQueue =
                new ArrayBlockingQueue<QueueItem<Representation>>(queueSize);
        BlockingQueue<QueueItem<Representation>> finishedEntityQueue =
                new ArrayBlockingQueue<QueueItem<Representation>>(queueSize);
        BlockingQueue<QueueItem<IndexingError>> errorEntityQueue =
                new ArrayBlockingQueue<QueueItem<IndexingError>>(queueSize);

        //Set holding all active post-processing daemons
        final SortedSet<IndexingDaemon<?,?>> activeIndexingDeamons =
            new TreeSet<IndexingDaemon<?,?>>();
        //create the IndexingDaemons
        //TODO: Here we would need to create multiple instances in case
        //      one would e.g. like to use several threads for processing entities
        //(1) the daemon reading from the IndexingSources
        String entitySourceReaderName = "Post-processing: Entity Reader Deamon";
        activeIndexingDeamons.add(
            new EntityIdBasedIndexingDaemon(
                entitySourceReaderName,
                indexedEntityQueue, errorEntityQueue,
                entityIterator,
                dataProvider,
                null, //no score normaliser
                true)); //post-process all indexed entities
        //(2) The daemon for post-processing the entities
        activeIndexingDeamons.add(
            new EntityProcessorRunnable(
                "Post-processing: Entity Processor Deamon",
                indexedEntityQueue, //it consumes indexed Entities
                processedEntityQueue,  //it produces processed Entities
                errorEntityQueue,
                entityPostProcessors,
                //TODO: check that the score is not overridden by the NULL
                //      passed by the used LineBasedEntityIterator!
                Collections.singleton(SCORE_FIELD))); //ensure the score is not changed
        //(3) The daemon for persisting the entities
        activeIndexingDeamons.add(
            new EntityPersisterRunnable(
                "Indexing: Entity Perstisting Deamon",
                processedEntityQueue, //it consumes processed Entities
                finishedEntityQueue, //it produces finished Entities
                errorEntityQueue,
                chunkSize, indexingDestination.getYard()));
        //(4) The daemon for logging finished entities
        activeIndexingDeamons.add(
            new FinishedEntityDaemon(
                finishedEntityQueue, -1, log,
                null)); //we already have all entity ids!
        //(5) The daemon for logging errors
        activeIndexingDeamons.add(
            new EntityErrorLoggerDaemon(
            errorEntityQueue, log));
        //start post-processing and wait until it has finished
        startAndWait(activeIndexingDeamons);       
       
       
        setState(State.POSTPROCESSED);
    }
    /**
     * Internally used to start the indexing/post-processing daemons and wait
     * until they have finished.
     * @param activeIndexingDeamons the daemons to start
     */
    private void startAndWait(final SortedSet<IndexingDaemon<?,?>> activeIndexingDeamons) {
        //We need a listener for the IndexingDaemons we are about to start!
        final IndexingDaemonListener listener = new IndexingDaemonListener() {
            @Override
            public void indexingDaemonFinished(IndexingDaemonEventObject indexingDaemonEventObject) {
                //looks like one has finished
                IndexingDaemon<?,?> indexingDaemon = indexingDaemonEventObject.getSource();
                //handle the finished indexing daemon
                handleFinishedIndexingDaemon(activeIndexingDeamons, indexingDaemon);
                //finally remove the listener
                indexingDaemon.removeIndexingDaemonListener(this);

            }
        };
        //now start the IndexingDaemons in their own Threads
        Set<IndexingDaemon<?,?>> daemonCopy =
            new HashSet<IndexingDaemon<?,?>>(activeIndexingDeamons);
        for(IndexingDaemon<?,?> daemon : daemonCopy){
            daemon.addIndexingDaemonListener(listener); //add the listener
            Thread thread = new Thread(daemon);// create the thread
            thread.setDaemon(true); //ensure that the JVM can terminate
            thread.setName(daemon.getName()); // set the name of the thread
            thread.start(); //start the Thread
        }
        //now we need to wait until all Threads have finished ...
        //NOTE: the empty-check and the wait() MUST be within the same
        //synchronized block, otherwise the final notifyAll() could be missed
        synchronized (activeIndexingDeamons) {
            while(!activeIndexingDeamons.isEmpty()){
                try {
                    activeIndexingDeamons.wait();
                } catch (InterruptedException e) {
                    //interrupted while waiting ... re-check the condition
                }
            }
        }
        //done!
    }

    @Override
    public void finaliseIndexing() {
        synchronized (stateSync) { //ensure that two threads do not start the
            //finalisation at the same time ...
            State state = getState();
            if(state.ordinal() < State.POSTPROCESSED.ordinal()){
                throw new IllegalStateException("The Indexer MUST BE already "+State.POSTPROCESSED+" when calling this Method!");
            }
            if(state != State.POSTPROCESSED){ //if state > POSTPROCESSED
                return; // ignore this call
            }
            setState(State.FINALISING);
            log.info("Finalisation started ...");
        }
        indexingDestination.finalise();
        setState(State.FINISHED);
    }
    @Override
    public void indexEntities() {
        synchronized (stateSync) { //ensure that two threads do not start the
            //indexing at the same time ...
            State state = getState();
            if(state.ordinal() < State.INITIALISED.ordinal()){
                throw new IllegalStateException("The Indexer MUST BE already "+State.INITIALISED+" when calling this Method!");
            }
            if(state != State.INITIALISED){ //if state > INITIALISED
                return; // ignore this call
            }
            setState(State.INDEXING);
            log.info("Indexing started ...");
        }
        //init the queues
        int queueSize = Math.max(MIN_QUEUE_SIZE, chunkSize*2);
        BlockingQueue<QueueItem<Representation>> indexedEntityQueue =
                new ArrayBlockingQueue<QueueItem<Representation>>(queueSize);
        BlockingQueue<QueueItem<Representation>> processedEntityQueue =
                new ArrayBlockingQueue<QueueItem<Representation>>(queueSize);
        BlockingQueue<QueueItem<Representation>> finishedEntityQueue =
                new ArrayBlockingQueue<QueueItem<Representation>>(queueSize);
        BlockingQueue<QueueItem<IndexingError>> errorEntityQueue =
                new ArrayBlockingQueue<QueueItem<IndexingError>>(queueSize);
       
        //Set holding all active IndexingDaemons
        final SortedSet<IndexingDaemon<?,?>> activeIndexingDeamons =
            new TreeSet<IndexingDaemon<?,?>>();
        //create the IndexingDaemons
        //TODO: Here we would need to create multiple instances in case
        //      one would e.g. like to use several threads for processing entities
        //(1) the daemon reading from the IndexingSources
        String entitySourceReaderName = "Indexing: Entity Source Reader Deamon";
        if(entityIterator != null){
            activeIndexingDeamons.add(
                new EntityIdBasedIndexingDaemon(
                    entitySourceReaderName,
                    indexedEntityQueue, errorEntityQueue,
                    entityIterator,
                    dataProvider,
                    scoreNormaliser,
                    indexAllEntitiesState));
        } else {
            activeIndexingDeamons.add(
                new EntityDataBasedIndexingDaemon(
                    entitySourceReaderName,
                    indexedEntityQueue, errorEntityQueue,
                    dataIterable,
                    scoreProvider,
                    scoreNormaliser,
                    indexAllEntitiesState));
        }
        //(2) The daemon for processing the entities
        activeIndexingDeamons.add(
            new EntityProcessorRunnable(
                "Indexing: Entity Processor Deamon",
                indexedEntityQueue, //it consumes indexed Entities
                processedEntityQueue,  //it produces processed Entities
                errorEntityQueue,
                entityProcessors,
                Collections.singleton(SCORE_FIELD)));
        //(3) The daemon for persisting the entities
        activeIndexingDeamons.add(
            new EntityPersisterRunnable(
                "Indexing: Entity Perstisting Deamon",
                processedEntityQueue, //it consumes processed Entities
                finishedEntityQueue, //it produces finished Entities
                errorEntityQueue,
                chunkSize, indexingDestination.getYard()));
        //(4) The daemon for logging finished entities
        activeIndexingDeamons.add(
            new FinishedEntityDaemon(
                finishedEntityQueue, -1, log, indexedEntityIdOutputStream));
        //(5) The daemon for logging errors
        activeIndexingDeamons.add(
            new EntityErrorLoggerDaemon(
            errorEntityQueue, log));
        //start indexing and wait until it has finished
        startAndWait(activeIndexingDeamons);
        //set the new state to INDEXED
        setState(State.INDEXED);
    }
    /**
     * Handles the necessary actions if an {@link IndexingDaemon} used for the
     * work done within {@link #indexEntities()} completes its work (meaning
     * it has processed all entities).<p>
     * The passed SortedSet is created within {@link #indexEntities()} and
     * contains all {@link IndexingDaemon}s that have not yet finished. It is
     * the responsibility of this method to remove finished
     * {@link IndexingDaemon}s from this set.<p>
     * In addition this method is responsible for {@link BlockingQueue#put(Object) putting}
     * the {@link IndexerConstants#INDEXING_COMPLETED_QUEUE_ITEM} to the
     * consuming queue of {@link IndexingDaemon}s as soon as all
     * {@link IndexingDaemon}s of the previous indexing steps have finished.
     * This is checked by comparing the {@link IndexingDaemon#getSequence()} number
     * of the first entry within the sorted activeIndexingDeamons set.
     * If the sequence number of the first element has increased after the
     * finished {@link IndexingDaemon} was removed, this method puts the
     * {@link IndexerConstants#INDEXING_COMPLETED_QUEUE_ITEM} item to the
     * consuming queue of the new first entry of activeIndexingDeamons.
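     * <p>
     * A worked example (sequence numbers are illustrative):
     * <pre>
     *   active = [seq 1: source reader, seq 2: processor, seq 3: persister]
     *   source reader finishes -> removed -> first sequence: 1 -> 2
     *       -> put INDEXING_COMPLETED_QUEUE_ITEM on the processor's
     *          consume queue
     *   processor drains its queue and finishes -> the item cascades to
     *       the persister in the same way
     * </pre>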
     * @param activeIndexingDeamons The SortedSet containing all
     * {@link IndexingDaemon}s that are still active.
     * @param indexingDaemon the {@link IndexingDaemon} that completed its work.
     */
    private void handleFinishedIndexingDaemon(final SortedSet<IndexingDaemon<?,?>> activeIndexingDeamons,
                                              IndexingDaemon<?,?> indexingDaemon) {
        log.info("{} completed (sequence={}) ... ",
            indexingDaemon.getName(), indexingDaemon.getSequence());
        IndexingDaemon<?,?> sendEndofQueue = null;
        synchronized (activeIndexingDeamons) {
            if(log.isDebugEnabled()){
                log.debug(" Active Indexing Daemons:");
                for(IndexingDaemon<?,?> active : activeIndexingDeamons){
                    log.debug(" > {} {}",active.getSequence(),active.getName());
                }
            }
            }
            //get the SequenceNumber of the first Element
            if(!activeIndexingDeamons.isEmpty()){
                Integer sequenceNumber = activeIndexingDeamons.first().getSequence();
                log.info(" > current sequence : {}",sequenceNumber);
                //try to remove it from the activeDeamons list
                activeIndexingDeamons.remove(indexingDaemon);
                if(activeIndexingDeamons.isEmpty()){ //if no active is left
                    log.debug("  - indexingDeamons list now emoty ... notifyAll to indicate indexing has completed!");
                    activeIndexingDeamons.notifyAll(); //notify all others!
                } else { //check new SequenceNumber
                    IndexingDaemon<?,?> first = activeIndexingDeamons.first();
                    if(sequenceNumber.compareTo(first.getSequence()) < 0){
                        log.info(" > new sequence: {}",first.getSequence());
                        //sequence number increased ->
                        // ... all Daemons for the step have completed
                        // ... send EndOfQueue
                        // ... but outside of the synchronized block!
                        sendEndofQueue = first;
                    }
                }
            } //already empty ... nothing to do
        }
        if(sendEndofQueue != null){ //send endOfQueue
            //to the consuming Queue of this one
            try {
                //ignore type safety because the value of the
                //INDEXING_COMPLETED_QUEUE_ITEM is anyway null
                log.info("Send end-of-queue to Daemons with Sequence "+sendEndofQueue.getSequence());
                sendEndofQueue.getConsumeQueue().put(INDEXING_COMPLETED_QUEUE_ITEM);
            } catch (InterruptedException e) {
                log.error("Interupped while sending EnodOfQueue Item to consuming queue of "+sendEndofQueue.getName(),e);
            }
        }
    }       
    /* (non-Javadoc)
     * @see org.apache.stanbol.entityhub.indexing.core.IndexerInterface#setIndexAllEntitiesState(boolean)
     */
    public void setIndexAllEntitiesState(boolean indexAllEntitiesState) {
        this.indexAllEntitiesState = indexAllEntitiesState;
    }
    /* (non-Javadoc)
     * @see org.apache.stanbol.entityhub.indexing.core.IndexerInterface#isIndexAllEntitiesState()
     */
    public boolean isIndexAllEntitiesState() {
        return indexAllEntitiesState;
    }
    /**
     * Setter for the state. <p>
     * Implementation Note: This setter is synchronised to the sync object for
     * the state
     * @param state the state to set
     */
    private void setState(State state) {
        boolean changed;
        synchronized (stateSync) {
            changed = state != this.state;
            if(changed){
                this.state = state;
            }
        }
        if(changed){ //do not fire events within synchronized blocks ...
            fireStateChanged();
        }
    }
    /* (non-Javadoc)
     * @see org.apache.stanbol.entityhub.indexing.core.IndexerInterface#getState()
     */
    public State getState() {
        return state;
    }
    /**
     * Opens a stream to write data to the {@link #indexedEntityIdFile}.
     * Can only be called in {@link State}s earlier than {@link State#INDEXING}.
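     * <p>
     * The compression of the written data is chosen from the file extension
     * of {@link #indexedEntityIdFile} (a sketch of the behaviour implemented
     * below; the file names are illustrative):
     * <pre>{@code
     *   "ids.txt.gz"  -> GZIPOutputStream
     *   "ids.txt.bz2" -> BZip2CompressorOutputStream
     *   "ids.zip"     -> ZipOutputStream (single "entity-ids.txt" entry)
     *   "ids.txt"     -> plain FileOutputStream
     * }</pre>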
     * @return the stream
     * @throws IOException on any error while opening the stream
     * @throws IllegalStateException if {@link #getState()} is later than
     * {@link State#INITIALISED}
     */
    protected OutputStream getEntityIdFileOutputStream() throws IOException {
        if(indexedEntityIdFile == null){
            return null;
        }
        State state = getState();
        if(state.ordinal() > State.INITIALISED.ordinal()){
            throw new IllegalStateException("Opening an OutputStrem to the "
                    + "indexed entity id file '"+indexedEntityIdFile+"' is not "
                    + "allowed for states > "+State.INITIALISED +" (current: "+state+")!");
        }
        if(indexedEntityIdFile.isFile()){//exists
            log.info(" ... delete existing IndexedEntityId file "+indexedEntityIdFile);
            indexedEntityIdFile.delete(); //delete existing data
        }
        //support compression
        String extension = FilenameUtils.getExtension(indexedEntityIdFile.getName());
        OutputStream out = new FileOutputStream(indexedEntityIdFile);
        if("zip".equalsIgnoreCase(extension)){
            out = new ZipOutputStream(out);
            ((ZipOutputStream)out).putNextEntry(new ZipEntry("entity-ids.txt"));
        } else if("gz".equalsIgnoreCase(extension)) {
            out = new GZIPOutputStream(out);
        } else if("bz2".equalsIgnoreCase(extension)) {
            out = new BZip2CompressorOutputStream(out);
        }
        return out;
    }
    /**
     * Opens a stream to read data from the {@link #indexedEntityIdFile}.
     * Can only be called in {@link State#INDEXED} or later {@link State}s.
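     * <p>
     * A reading sketch (illustrative only; it mirrors how
     * {@link #postProcessEntities()} consumes this stream via a
     * {@link LineBasedEntityIterator}). Compression is detected from the
     * file extension, symmetrically to {@link #getEntityIdFileOutputStream()}:
     * <pre>{@code
     *   InputStream in = getEntityIdFileInputStream();
     *   if(in != null){ // null if no indexedEntityIdFile is configured
     *       List<String> ids = IOUtils.readLines(in, "UTF-8");
     *   }
     * }</pre>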
     * @return the stream
     * @throws IOException on any error while creating the stream
     * @throws IllegalStateException if {@link #getState()} is earlier than
     * {@link State#INDEXED}
     */
    protected InputStream getEntityIdFileInputStream() throws IOException {
        if(indexedEntityIdFile == null){
            return null;
        }
        State state = getState();
        if(state.ordinal() < State.INDEXED.ordinal()){
            throw new IllegalStateException("The indexed entity id data is not" +
                "available for states < "+State.INDEXED +" (current: "+state+")!");
        }
        //support compression
        String extension = FilenameUtils.getExtension(indexedEntityIdFile.getName());
        InputStream in = new FileInputStream(indexedEntityIdFile);
        if("zip".equalsIgnoreCase(extension)){
            in = new ZipInputStream(in);
            ((ZipInputStream)in).getNextEntry();
        } else if("gz".equalsIgnoreCase(extension)) {
            in = new GZIPInputStream(in);
        } else if("bz2".equalsIgnoreCase(extension)) {
            in = new BZip2CompressorInputStream(in);
        }
        return in;
    }

}