// Source Code of org.apache.stanbol.entityhub.indexing.geonames.GeonamesIndexingSource

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.entityhub.indexing.geonames;


import static org.apache.stanbol.entityhub.indexing.geonames.GeonamesConstants.GEONAMES_ONTOLOGY_NS;
import static org.apache.stanbol.entityhub.indexing.geonames.GeonamesConstants.GEONAMES_RESOURCE_NS;
import static org.apache.stanbol.entityhub.indexing.geonames.GeonamesConstants.getReference;
import static org.apache.stanbol.entityhub.indexing.geonames.GeonamesConstants.valueFactory;


import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.StringTokenizer;


import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.LineIterator;
import org.apache.stanbol.commons.namespaceprefix.NamespacePrefixService;
import org.apache.stanbol.entityhub.core.mapping.DefaultFieldMapperImpl;
import org.apache.stanbol.entityhub.core.mapping.FieldMappingUtils;
import org.apache.stanbol.entityhub.core.mapping.ValueConverterFactory;
import org.apache.stanbol.entityhub.core.site.CacheUtils;
import org.apache.stanbol.entityhub.core.utils.TimeUtils;
import org.apache.stanbol.entityhub.indexing.core.EntityDataIterable;
import org.apache.stanbol.entityhub.indexing.core.EntityDataIterator;
import org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig;
import org.apache.stanbol.entityhub.indexing.core.source.ResourceImporter;
import org.apache.stanbol.entityhub.indexing.core.source.ResourceLoader;
import org.apache.stanbol.entityhub.indexing.core.source.ResourceState;
import org.apache.stanbol.entityhub.servicesapi.defaults.DataTypeEnum;
import org.apache.stanbol.entityhub.servicesapi.mapping.FieldMapper;
import org.apache.stanbol.entityhub.servicesapi.model.Representation;
import org.apache.stanbol.entityhub.servicesapi.yard.YardException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


public class GeonamesIndexingSource implements EntityDataIterable, ResourceImporter {
    
    private static final Logger log = LoggerFactory.getLogger(GeonamesIndexingSource.class);
    /**
     * The Parameter used to configure the source folder(s) relative to the
     * {@link IndexingConfig#getSourceFolder()}. The ',' (comma) is used as
     * separator to parsed multiple sources.
     */
    public static final String PARAM_SOURCE_FILE_OR_FOLDER = "source";
    /**
     * The zip archive with the geonames entities
     */
    public static final String GEONAMES_DUMP = "allCountries.zip";


    private IndexingConfig indexingConfig;
    private NamespacePrefixService nsPrefixService;
    private File dataDir;
    
    private ResourceLoader loader = new ResourceLoader(this, false, false);


    protected static class Resource {
        protected final String name;
        protected final InputStream is;
        protected Resource(String name, InputStream is) {
            this.name = name;
            this.is = is;
        }
        
        public String getName() {
            return name;
        }
        
        public LineIterator getEntries() throws IOException {
            if(name.endsWith(".zip")){
                ZipArchiveInputStream zipIn = new ZipArchiveInputStream(is);
                zipIn.getNextEntry();
                return IOUtils.lineIterator(zipIn, "UTF-8");
            } else {
                return IOUtils.lineIterator(is, "UTF-8");
            }
        }
        
    }
    private List<Resource> resourceList = new ArrayList<GeonamesIndexingSource.Resource>();
    private boolean consumed;
    
    @Override
    public void setConfiguration(Map<String,Object> config) {
        indexingConfig = (IndexingConfig)config.get(IndexingConfig.KEY_INDEXING_CONFIG);
        nsPrefixService = indexingConfig.getNamespacePrefixService();
        log.info("reading Geonames data from:");
        Object value = config.get(PARAM_SOURCE_FILE_OR_FOLDER);
        if(value == null){ //if not set use the default
            value = GeonamesConstants.DEFAULT_SOURCE_FOLDER_NAME + GEONAMES_DUMP;
            log.info("No Geonames.org dump source set use the default: {}",value);
        }
        for(String source : value.toString().split(",")){
            File sourceFileOrDirectory = indexingConfig.getSourceFile(source);
            if(sourceFileOrDirectory.exists()){
                //register the configured source with the ResourceLoader
                this.loader.addResource(sourceFileOrDirectory);
            } else {
                if(FilenameUtils.getExtension(source).isEmpty()){
                    //non existent directory -> create
                    //This is typically the case if this method is called to
                    //initialise the default configuration. So we will try
                    //to create the directory users need to copy the source
                    //RDF files.
                    if(!sourceFileOrDirectory.mkdirs()){
                        log.warn("Unable to create directory {} configured to improt geonames.org data from. " +
                                "You will need to create this directory manually before copying the" +
                                "Geonames files into it.",sourceFileOrDirectory);
                        //this would not be necessary because the directory will
                        //be empty - however I like to be consistent and have
                        //all configured and existent files & dirs added the the
                        //resource loader
                        this.loader.addResource(sourceFileOrDirectory);
                    }
                } else {
                    log.warn("Unable to find RDF source {} within the indexing Source folder ",source,indexingConfig.getSourceFolder());
                }
            }
        }
     }


    @Override
    public boolean needsInitialisation() {
        //if there are resources with the state REGISTERED we need an initialisation
        return !loader.getResources(ResourceState.REGISTERED).isEmpty();
    }


    @Override
    public void initialise() {
        loader.loadResources();
    }


    @Override
    public void close() {
        loader = null;
        for(Resource resource : resourceList){
            IOUtils.closeQuietly(resource.is);
        }
    }


    @Override
    public ResourceState importResource(InputStream is, String resourceName) throws IOException {
        resourceList.add(new Resource(resourceName, is));
        return ResourceState.LOADED;
    }


    
    @Override
    public EntityDataIterator entityDataIterator() {
        if(!consumed){
            consumed = true;
        } else {
            throw new IllegalStateException("This implementation supports only a"
                    + "single Iteration of the data.");
        }
        return new EntityDataIterator() {
            
            Iterator<Resource> resources = resourceList.iterator();
            Resource r;
            LineIterator it = null;
            private String next;
            private Representation rep;
            
            
            private String getNext(){
                while((it == null || !it.hasNext()) && resources != null && resources.hasNext()){
                    if(r != null){
                        IOUtils.closeQuietly(r.is);
                    }
                    r = resources.next();
                    try {
                        it = r.getEntries();
                    } catch (IOException e) {
                        log.error("Unable to read Resource '"+r.getName()+"' because of "+e.getMessage(),e);
                        e.printStackTrace();
                        IOUtils.closeQuietly(r.is);
                        it = null;
                    }
                    resources.remove();
                }
                if(it != null && it.hasNext()){
                    return it.nextLine();
                } else {
                    return null;
                }
            }
            
            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }
            
            @Override
            public String next() {
                if(next == null){
                    next = getNext();
                }
                if(next == null){
                    throw new NoSuchElementException();
                } else {
                    rep = processGeonameEntry(next);
                    next = null;
                    return rep.getId();
                }
            }
            
            @Override
            public boolean hasNext() {
                if(next == null){
                    next = getNext();
                }
                return next != null;
            }
            
            @Override
            public Representation getRepresentation() {
                return rep;
            }
            
            @Override
            public void close() {
                if(r != null){
                    IOUtils.closeQuietly(r.is);
                }
                next = null;
                it = null;
                resources = null;
            }
            /**
             * Parses the Representation from the current line.<p>
             * NOTE: this does not process alternate labels and also does not
             * lookup entities for parent codes. Those things are done now by
             * own EntityProcessors
             * @param line the line to process
             * @return the representation
             */
            private Representation processGeonameEntry(String line){
                LineTokenizer t = new LineTokenizer(line);
                //[0] geonames id
                String id = t.next();
                Integer geoNamesId = Integer.parseInt(id);
                //create a new Doc based on the first Element (geonamesID)
                Representation doc = valueFactory.createRepresentation(
                    new StringBuilder(GEONAMES_RESOURCE_NS).append(id).toString());
                //add the Integer id so that we do not need to parse it from the subject URI
                doc.add(GeonamesPropertyEnum.idx_id.toString(), geoNamesId);
                //add the geonames:Feature type
                doc.add(GeonamesPropertyEnum.rdf_type.toString(), getReference(GeonamesPropertyEnum.gn_Feature.toString()));
                //[1] UTF-8 name
                String utf8Label = t.next();
                //[2] ASKII Name as rdfs:label
                String askiiLabel = t.next();
                if(utf8Label == null){
                    utf8Label = askiiLabel; //use ASKII label as fallback for the utf8 version
                }
                doc.addNaturalText(GeonamesPropertyEnum.gn_name.toString(),utf8Label);
                //[3] Alternate Names
                t.next(); //alternate names are added later during processing
                //addAlternateNames(geoNamesId, doc);
                //[4] lat
                doc.add(GeonamesPropertyEnum.geo_lat.toString(),new BigDecimal(t.next()));
                //[5] lon
                doc.add(GeonamesPropertyEnum.geo_long.toString(),new BigDecimal(t.next()));
                //[6] featureClass
                String featureClass = new StringBuilder(GEONAMES_ONTOLOGY_NS).append(t.next()).toString();
                doc.add(GeonamesPropertyEnum.gn_featureClass.toString(),getReference(featureClass));
                //[7] featureCode (-> need to use <featureClass>.<featureCode>!!)
                doc.add(GeonamesPropertyEnum.gn_featureCode.toString(),getReference(
                    new StringBuilder(featureClass).append(t.next()).toString()));
                //countryCode
                //  -> geonames uses here the link to an HTML Page showing the Country
                //     We would like to use an Link to a SKOS:Concept representing the Country
                // ... But luckily here we need only to add the URI!
                Set<String> ccs = new HashSet<String>();
                //[8] countryCode
                String countryCode = t.next();
                if(countryCode != null){
                    countryCode = countryCode.trim(); //need to trim because some country codes use '  ' to indicate null!
                    if(countryCode.length() == 2){ //Yes there are some features that are in no country!
                        ccs.add(countryCode);
                    }
                }
                //[9] alternate countryCodes
                String altCc = t.next();
                if(altCc != null){
                    StringTokenizer altCcT = new StringTokenizer(altCc,",");
                    while(altCcT.hasMoreElements()){
                        countryCode = altCcT.nextToken();
                        if(countryCode.length() ==2){
                            ccs.add(countryCode);
                        }
                    }
                }
                if(!ccs.isEmpty()){
                    doc.add(GeonamesPropertyEnum.gn_countryCode.toString(),ccs);
                }
                //[10 TO 13] Admin codes
                //first read them -> we need to consume the tokens anyway
                String[] adminCodes = new String[] {
                    countryCode, //country
                    t.next(), //ADM1
                    t.next(), //ADM2
                    t.next(), //ADM3
                    t.next()};//ADM4
                //Workaround for Admin1 -> add leading '0' for single Value
                if(adminCodes[1] != null && adminCodes[1].length() < 2){
                    adminCodes[1] = '0'+adminCodes[1];
                }
                //now process the admin Codes (including the country at index 0)
                for(int i=0;i<adminCodes.length;i++){
                    if(adminCodes[i] != null && !adminCodes[i].equals("00")){ //00 is used to indicate not known
                        StringBuilder parentCode = new StringBuilder();
                        for(int j=0;j<i;j++){
                            parentCode.append(adminCodes[j]); //add all the previous
                            parentCode.append('.'); //add the seperator char
                        }
                        parentCode.append(adminCodes[i]);//add the current (last) Element
                        String property = i==0 ? GeonamesPropertyEnum.idx_CC.toString() :
                            new StringBuilder(GeonamesPropertyEnum.idx_ADM.toString()).append(i).toString();
                        doc.add(property, parentCode.toString());
                    }
                }


                //[14] population
                String populationString = t.next();
                if(populationString != null){
                    //NOTE: we need to used Long, because of Asia (3.800.000)
                    Long population = new Long(populationString);
                    if(population.intValue() > 0){
                        doc.add(GeonamesPropertyEnum.gn_population.toString(),population);
                    }
                }
                
                //[15 TO 16] elevation and gtopo30
                String altString = t.next();
                if(altString == null){
                    altString = t.next(); //if no elevation than use the gtopo30
                } else {
                    t.next(); //if there is already en elevation, than consume these entry
                }
                Integer alt = Integer.valueOf(altString);
                if(alt.intValue() > -9999){ //it looks like that -9999 is sometimes used as not known!
                    doc.add(GeonamesPropertyEnum.geo_alt.toString(),alt);
                }
                
                //[17] time zone
                t.next(); //not used
                //[18] mod-date
                String modDateString = t.next();
                if(modDateString != null){
                    try {
                        doc.add(GeonamesPropertyEnum.dc_date.toString(),TimeUtils.toDate(DataTypeEnum.DateTime, modDateString));
                    }catch (IllegalArgumentException e) {
                        log.warn(String.format("Unable to parse modificationDate for geonamesID %s from value %s",doc.getId(),modDateString));
                    }
                }
                //no creator as this is anyway provided by attribution
                //doc.add(GeonamesPropertyEnum.dc_creator.toString(),"http://www.geonames.org/");
                return doc;
            }
        };
    }
}
// Source Code of org.apache.stanbol.entityhub.indexing.geonames.GeonamesIndexingSource

// Related Classes of org.apache.stanbol.entityhub.indexing.geonames.GeonamesIndexingSource