Package org.dbpedia.spotlight.spot.cooccurrence.training

Source Code of org.dbpedia.spotlight.spot.cooccurrence.training.AnnotatedDatasetEnricherNGram

package org.dbpedia.spotlight.spot.cooccurrence.training;

import org.dbpedia.spotlight.spot.cooccurrence.InstanceBuilderFactory;
import org.dbpedia.spotlight.spot.cooccurrence.features.data.OccurrenceDataProviderSQL;
import org.dbpedia.spotlight.spot.cooccurrence.filter.FilterPattern;
import org.dbpedia.spotlight.spot.cooccurrence.filter.FilterTermsize;
import org.dbpedia.spotlight.exceptions.ConfigurationException;
import org.dbpedia.spotlight.exceptions.InitializationException;
import org.dbpedia.spotlight.model.SpotlightConfiguration;
import org.dbpedia.spotlight.model.SpotlightFactory;
import org.json.JSONException;
import weka.core.Instances;

import java.io.File;
import java.io.IOException;

/**
* Generates training data for an ngram classifier.
*
* {@see AnnotatedDatasetEnricher}
*
* @author Joachim Daiber
*/
public class AnnotatedDatasetEnricherNGram extends AnnotatedDatasetEnricher {


  /**
   * Create a new AnnotatedDatasetEnricher for ngrams.
   *
   * @param configuration Spotlight configuration object
   * @throws ConfigurationException Error in configuration.
   * @throws IOException Error when reading file.
   * @throws InitializationException Error in Initialization.
   */
  public AnnotatedDatasetEnricherNGram(SpotlightConfiguration configuration) throws ConfigurationException, IOException, InitializationException {
    super(configuration);

    OccurrenceDataProviderSQL.initialize(configuration.getSpotterConfiguration());
    dataProvider  = OccurrenceDataProviderSQL.getInstance();

    instanceBuilder = InstanceBuilderFactory.createInstanceBuilderNGram(
        configuration.getSpotterConfiguration().getCoOcSelectorDatasource(), dataProvider);
    instanceBuilder.setVerboseMode(true);

    /** Filter the data set: */
    FilterTermsize filterTermsize = new FilterTermsize(FilterTermsize.Termsize.unigram, spotlightFactory.textUtil());
    filterTermsize.inverse();

    FilterPattern filterPattern = new FilterPattern();

    filters.add(filterTermsize);
    filters.add(filterPattern);

    /** Create a new header object: */
    header = new Instances("NgramTraining", buildAttributeList(), buildAttributeList().size());

  }

 
  public static void main(String[] args) throws ConfigurationException, IOException, JSONException, InitializationException {

    SpotlightConfiguration configuration = new SpotlightConfiguration("conf/server.properties");


    AnnotatedDatasetEnricherNGram annotatedDatasetEnricherNGram = new AnnotatedDatasetEnricherNGram(configuration);
    SpotlightFactory spotlightFactory = new SpotlightFactory(configuration);
   
    AnnotatedDataset trainingData = new AnnotatedDataset(
        new File("/Users/jodaiber/Documents/workspace/ba/BachelorThesis/01 Evaluation/02 Annotation/Software/custom/src/annotation/final.train.json"),
        AnnotatedDataset.Format.JSON, spotlightFactory);
    annotatedDatasetEnricherNGram.writeDatasetXRFF(trainingData, new File("/Users/jodaiber/Desktop/final.ngram.xrff"));

  }



}
TOP

Related Classes of org.dbpedia.spotlight.spot.cooccurrence.training.AnnotatedDatasetEnricherNGram

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.