Package edu.stanford.nlp.pipeline

Examples of edu.stanford.nlp.pipeline.Annotation


  /**
   * Reads annotations from the given file in the requested input format,
   * returning one Annotation per sentence.
   */
  public static List<Annotation> getAnnotations(StanfordCoreNLP tokenizer, Input inputFormat, String filename, boolean filterUnknown) {
    switch (inputFormat) {
    case TEXT: {
      String text = IOUtils.slurpFileNoExceptions(filename);
      Annotation annotation = new Annotation(text);
      tokenizer.annotate(annotation);
      List<Annotation> annotations = Generics.newArrayList();
      for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        Annotation nextAnnotation = new Annotation(sentence.get(CoreAnnotations.TextAnnotation.class));
        nextAnnotation.set(CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence));
        annotations.add(nextAnnotation);
      }
      return annotations;
    }
    case TREES: {
      List<Tree> trees;
      if (filterUnknown) {
        trees = SentimentUtils.readTreesWithGoldLabels(filename);
        trees = SentimentUtils.filterUnknownRoots(trees);
      } else {
        trees = Generics.newArrayList();
        MemoryTreebank treebank = new MemoryTreebank("utf-8");
        treebank.loadPath(filename, null);
        for (Tree tree : treebank) {
          trees.add(tree);
        }
      }

      List<Annotation> annotations = Generics.newArrayList();
      for (Tree tree : trees) {
        CoreMap sentence = new Annotation(Sentence.listToString(tree.yield()));
        sentence.set(TreeCoreAnnotations.TreeAnnotation.class, tree);
        List<CoreMap> sentences = Collections.singletonList(sentence);
        Annotation annotation = new Annotation("");
        annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
        annotations.add(annotation);
      }
      return annotations;
    }
    default:
      throw new IllegalArgumentException("Unknown input format " + inputFormat);
    }
  }
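For context, a minimal, self-contained sketch of the document-to-sentences pattern the snippet above builds on. The annotator list and sample text are assumptions, not taken from the snippet:

import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class AnnotationBasicsExample {
  public static void main(String[] args) {
    // Tokenize and sentence-split only; no heavier annotators are needed here.
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    // An Annotation starts as a bare text holder; annotate() fills it in.
    Annotation document = new Annotation("First sentence. Second sentence.");
    pipeline.annotate(document);

    // Iterate the sentences, as the per-sentence loop above does.
    for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
      System.out.println(sentence.get(CoreAnnotations.TextAnnotation.class));
    }
  }
}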


        if (line == null) {
          break;
        }
        line = line.trim();
        if (line.length() > 0) {
          Annotation annotation = tokenizer.process(line);
          pipeline.annotate(annotation);
          for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
            outputTree(System.out, sentence, outputFormats);
          }
        } else {
          // Output blank lines for blank lines so the tool can be
          // used for line-by-line text processing
          System.out.println();
        }
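The fragment above starts inside a read loop. A plausible, self-contained reconstruction of that loop, assuming a single pipeline where the original splits work between a tokenizer and a second pipeline, and printing sentence text where the original calls outputTree:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class LineByLineExample {
  public static void main(String[] args) throws IOException {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, parse");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    BufferedReader reader =
        new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
    for (String line; (line = reader.readLine()) != null; ) {
      line = line.trim();
      if (line.length() > 0) {
        Annotation annotation = pipeline.process(line);
        for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
          System.out.println(sentence.get(CoreAnnotations.TextAnnotation.class));
        }
      } else {
        // Blank line in, blank line out, so output stays line-aligned with input.
        System.out.println();
      }
    }
  }
}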

    CoNLL2011DocumentReader.Document conllDoc = reader.getNextDocument();
    if (conllDoc == null) {
      return null;
    }

    Annotation anno = conllDoc.getAnnotation();
    List<CoreMap> sentences = anno.get(CoreAnnotations.SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
      if (!Constants.USE_GOLD_PARSES && !replicateCoNLL) {
        // Remove tree from annotation and replace with parse using stanford parser
        sentence.remove(TreeCoreAnnotations.TreeAnnotation.class);
      } else {
        Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
        if (LEMMATIZE) {
          treeLemmatizer.transformTree(tree);
        }
        // generate the dependency graph
        try {
          SemanticGraph deps = SemanticGraphFactory.makeFromTree(tree,
              SemanticGraphFactory.Mode.COLLAPSED, includeExtras, threadSafe);
          SemanticGraph basicDeps = SemanticGraphFactory.makeFromTree(tree,
              SemanticGraphFactory.Mode.BASIC, includeExtras, threadSafe);
          sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, basicDeps);
          sentence.set(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class, deps);
        } catch (Exception e) {
          logger.log(Level.WARNING, "Exception caught during extraction of Stanford dependencies. Will ignore and continue...", e);
        }
      }
    }

    String preSpeaker = null;
    int utterance = -1;
    for (CoreLabel token : anno.get(CoreAnnotations.TokensAnnotation.class)) {
      if (!token.containsKey(CoreAnnotations.SpeakerAnnotation.class))  {
        token.set(CoreAnnotations.SpeakerAnnotation.class, "");
      }
      String curSpeaker = token.get(CoreAnnotations.SpeakerAnnotation.class);
      if (!curSpeaker.equals(preSpeaker)) {
        utterance++;
        preSpeaker = curSpeaker;
      }
      token.set(CoreAnnotations.UtteranceAnnotation.class, utterance);
    }

    // Run pipeline
    stanfordProcessor.annotate(anno);

    for (CoreMap sentence : anno.get(CoreAnnotations.SentencesAnnotation.class)) {
      allWords.add(sentence.get(CoreAnnotations.TokensAnnotation.class));
      allTrees.add(sentence.get(TreeCoreAnnotations.TreeAnnotation.class));
    }

    // Initialize gold mentions
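The speaker-to-utterance numbering above stands on its own; here is the same idea on synthetic tokens (the speaker labels are made up):

import java.util.Arrays;
import java.util.List;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;

public class UtteranceNumberingExample {
  public static void main(String[] args) {
    List<CoreLabel> tokens = Arrays.asList(
        tokenWithSpeaker("A"), tokenWithSpeaker("A"),
        tokenWithSpeaker("B"), tokenWithSpeaker("A"));

    String preSpeaker = null;
    int utterance = -1;
    for (CoreLabel token : tokens) {
      String curSpeaker = token.get(CoreAnnotations.SpeakerAnnotation.class);
      // Every change of speaker starts a new utterance.
      if (!curSpeaker.equals(preSpeaker)) {
        utterance++;
        preSpeaker = curSpeaker;
      }
      token.set(CoreAnnotations.UtteranceAnnotation.class, utterance);
      System.out.println(curSpeaker + " -> utterance " + utterance);
    }
    // Prints utterances 0, 0, 1, 2.
  }

  private static CoreLabel tokenWithSpeaker(String speaker) {
    CoreLabel token = new CoreLabel();
    token.set(CoreAnnotations.SpeakerAnnotation.class, speaker);
    return token;
  }
}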

    }

    // make sure token character offsets are aligned with text
    List<CoreLabel> tokenSequence = copyTokens(tokens, adjustCharacterOffsets, false);

    Annotation newSentence = new Annotation(text);
    newSentence.set(CoreAnnotations.TokensAnnotation.class, tokenSequence);
    if (! adjustCharacterOffsets &&
        characterOffsetStart != null &&
        characterOffsetEnd != null){
      newSentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, characterOffsetStart);
      newSentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, characterOffsetEnd);
    } else {
      int tokenCharStart = tokenSequence.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
      int tokenCharEnd = tokenSequence.get(tokenSequence.size() - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
      newSentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, tokenCharStart);
      newSentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, tokenCharEnd);
    }

    // some default token offsets
    newSentence.set(CoreAnnotations.TokenBeginAnnotation.class, 0);
    newSentence.set(CoreAnnotations.TokenEndAnnotation.class, tokenSequence.size());

    return newSentence;
  }
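Once a sentence carries character offsets as above, the original span can be recovered by substring; a minimal sketch with hand-set offsets:

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;

public class OffsetReadbackExample {
  public static void main(String[] args) {
    String text = "Hello world.";
    Annotation sentence = new Annotation(text);
    sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, 0);
    sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, text.length());

    // Recover the original span from the stored offsets.
    int begin = sentence.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
    int end = sentence.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
    System.out.println(text.substring(begin, end));  // "Hello world."
  }
}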

    }
    List<String> retMsg = new ArrayList<String>();
    boolean haveSerializedEntityExtractor = serializedModelExists(MachineReadingProperties.serializedEntityExtractorPath);
    boolean haveSerializedRelationExtractor = serializedModelExists(MachineReadingProperties.serializedRelationExtractorPath);
    boolean haveSerializedEventExtractor = serializedModelExists(MachineReadingProperties.serializedEventExtractorPath);
    Annotation training = null;
    Annotation aux = null;
    if ((MachineReadingProperties.extractEntities && !haveSerializedEntityExtractor) ||
        (MachineReadingProperties.extractRelations && !haveSerializedRelationExtractor) ||
        (MachineReadingProperties.extractEvents && !haveSerializedEventExtractor) ||
        this.forceRetraining || MachineReadingProperties.crossValidate) {
      // load training sentences
      training = loadOrMakeSerializedSentences(MachineReadingProperties.trainPath, reader, new File(MachineReadingProperties.serializedTrainingSentencesPath));
      if (auxReader != null) {
        MachineReadingProperties.logger.severe("Reading auxiliary dataset from " + MachineReadingProperties.auxDataPath + "...");
        aux = loadOrMakeSerializedSentences(MachineReadingProperties.auxDataPath, auxReader, new File(
            MachineReadingProperties.serializedAuxTrainingSentencesPath));
        MachineReadingProperties.logger.severe("Done reading auxiliary dataset.");
      }
    }
   
    Annotation testing = null;
    if (!MachineReadingProperties.trainOnly && !MachineReadingProperties.crossValidate) {
      // load test sentences
      File serializedTestSentences = new File(MachineReadingProperties.serializedTestSentencesPath);
      testing = loadOrMakeSerializedSentences(MachineReadingProperties.testPath, reader, serializedTestSentences);
    }
     
    //
    // create the actual datasets to be used for training and annotation
    //
    makeDataSets(training, testing, aux);
   
    //
    // process (training + annotate) one partition at a time
    //
    for (int partition = 0; partition < datasets.length; partition++) {
      assert(datasets.length > partition);
      assert(datasets[partition] != null);
      assert(MachineReadingProperties.trainOnly || datasets[partition].second() != null);
     
      // train all models
      train(datasets[partition].first(), (MachineReadingProperties.crossValidate ? partition : -1));
      // annotate using all models
      if (!MachineReadingProperties.trainOnly) {
        MachineReadingProperties.logger.info("annotating partition " + partition);
        annotate(datasets[partition].second(), (MachineReadingProperties.crossValidate ? partition : -1));
      }
    }
   
    //
    // now report overall results
    //
    if (!MachineReadingProperties.trainOnly) {
      // merge test sets for the gold data
      Annotation gold = new Annotation("");
      for (int i = 0; i < datasets.length; i++) {
        AnnotationUtils.addSentences(gold, datasets[i].second().get(CoreAnnotations.SentencesAnnotation.class));
      }
     
      // merge test sets with predicted annotations
      Annotation[] mergedPredictions = new Annotation[3];
      assert(predictions != null);
      for (int taskLevel = 0; taskLevel < mergedPredictions.length; taskLevel++) {
        mergedPredictions[taskLevel] = new Annotation("");
        for (int fold = 0; fold < predictions[taskLevel].length; fold++) {
          if (predictions[taskLevel][fold] == null) continue;
          AnnotationUtils.addSentences(mergedPredictions[taskLevel], predictions[taskLevel][fold].get(CoreAnnotations.SentencesAnnotation.class));
        }
      }
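The gold/prediction merging above follows one pattern: start from an empty Annotation and append sentence lists. A sketch of that pattern as a helper; the AnnotationUtils import path is an assumption based on the machine-reading code above:

import java.util.List;

import edu.stanford.nlp.ie.machinereading.structure.AnnotationUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.util.CoreMap;

public class MergeSentencesExample {
  /** Merges the sentences of several corpora into a single Annotation. */
  static Annotation mergeSentences(List<Annotation> corpora) {
    Annotation merged = new Annotation("");
    for (Annotation corpus : corpora) {
      List<CoreMap> sentences = corpus.get(CoreAnnotations.SentencesAnnotation.class);
      if (sentences != null) {
        AnnotationUtils.addSentences(merged, sentences);
      }
    }
    return merged;
  }
}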

      if (partition != -1)
        MachineReadingProperties.logger.info("In partition #" + partition);
      String modelName = MachineReadingProperties.serializedRelationExtractorPath;
      if (partition != -1)
        modelName += "." + partition;
      Annotation predicted = null;
     
      if (MachineReadingProperties.useRelationExtractionModelMerging) {
        String[] modelNames = MachineReadingProperties.serializedRelationExtractorPath.split(",");
        if (partition != -1) {
          for (int i = 0; i < modelNames.length; i++) {
            modelNames[i] += "." + partition;
          }
        }
       
        relationExtractor = ExtractorMerger.buildRelationExtractorMerger(modelNames);
      } else if (!this.forceRetraining && new File(modelName).exists()) {
        MachineReadingProperties.logger.info("Loading relation extraction model from " + modelName + " ...");
        //TODO change this to load any type of BasicRelationExtractor
        relationExtractor = BasicRelationExtractor.load(modelName);
      } else {
        RelationFeatureFactory rff = makeRelationFeatureFactory(MachineReadingProperties.relationFeatureFactoryClass, MachineReadingProperties.relationFeatures, MachineReadingProperties.doNotLexicalizeFirstArg);
        Execution.fillOptions(rff, args);

        if (MachineReadingProperties.trainRelationsUsingPredictedEntities) {
          // generate predicted entities
          assert(entityExtractor != null);
          predicted = AnnotationUtils.deepMentionCopy(training);
          entityExtractor.annotate(predicted);
          for (ResultsPrinter rp : entityResultsPrinterSet){
            String msg = rp.printResults(training, predicted);
            MachineReadingProperties.logger.info("Training relation extraction using predicted entitities: entity scores using printer " + rp.getClass() + ":\n" + msg);
          }
         
          // change relation mentions to use predicted entity mentions rather than gold ones
          try {
            changeGoldRelationArgsToPredicted(predicted);
          } catch (Exception e) {
            // we may get here for unknown EntityMentionComparator class
            throw new RuntimeException(e);
          }
        }

        Annotation dataset;
        if (MachineReadingProperties.trainRelationsUsingPredictedEntities) {
          dataset = predicted;
        } else {
          dataset = training;
        }
       
        Set<String> relationsToSkip = new HashSet<String>(StringUtils.split(MachineReadingProperties.relationsToSkipDuringTraining, ","));
        List<List<RelationMention>> backedUpRelations = new ArrayList<List<RelationMention>>();
        if (relationsToSkip.size() > 0) {
          // back up the relations, since removeSkippableRelations modifies the dataset in place and we can't safely duplicate CoreMaps
          for (CoreMap sent : dataset.get(CoreAnnotations.SentencesAnnotation.class)) {
            List<RelationMention> relationMentions = sent.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
            backedUpRelations.add(relationMentions);
          }
         
          removeSkippableRelations(dataset, relationsToSkip);
        }
       
        //relationExtractor = new BasicRelationExtractor(rff, MachineReadingProperties.createUnrelatedRelations, makeRelationMentionFactory(MachineReadingProperties.relationMentionFactoryClass));
        relationExtractor = makeRelationExtractor(MachineReadingProperties.relationClassifier, rff, MachineReadingProperties.createUnrelatedRelations,
          makeRelationMentionFactory(MachineReadingProperties.relationMentionFactoryClass));
        Execution.fillOptions(relationExtractor, args);
        //Arguments.parse(args,relationExtractor);
        MachineReadingProperties.logger.info("Training relation extraction model...");
        relationExtractor.train(dataset);
        MachineReadingProperties.logger.info("Serializing relation extraction model to " + modelName + " ...");
        relationExtractor.save(modelName);

        if (relationsToSkip.size() > 0) {
          // restore backed up relations into dataset
          int sentenceIndex = 0;
         
          for (CoreMap sentence : dataset.get(CoreAnnotations.SentencesAnnotation.class)) {
            List<RelationMention> relationMentions = backedUpRelations.get(sentenceIndex);
            sentence.set(MachineReadingAnnotations.RelationMentionsAnnotation.class, relationMentions);
            sentenceIndex++;
          }
        }
      }
    }

    //
    // train event extraction -- currently just works with MSTBasedEventExtractor
    //
    if (MachineReadingProperties.extractEvents) {
      MachineReadingProperties.logger.info("Training event extraction model(s)");
      if (partition != -1) MachineReadingProperties.logger.info("In partition #" + partition);
      String modelName = MachineReadingProperties.serializedEventExtractorPath;
      if (partition != -1) modelName += "." + partition;
      File modelFile = new File(modelName);

      Annotation predicted = null;
      if (!this.forceRetraining && modelFile.exists()) {
        MachineReadingProperties.logger.info("Loading event extraction model from " + modelName + " ...");
        Method mstLoader = (Class.forName("MSTBasedEventExtractor")).getMethod("load", String.class);
        eventExtractor = (Extractor) mstLoader.invoke(null, modelName);
      } else {

    //
    // annotate entities
    //
    if (MachineReadingProperties.extractEntities) {
      assert(entityExtractor != null);
      Annotation predicted = AnnotationUtils.deepMentionCopy(testing);
      entityExtractor.annotate(predicted);
     
      for (ResultsPrinter rp : entityResultsPrinterSet){
        String msg = rp.printResults(testing, predicted);
        MachineReadingProperties.logger.info("Entity extraction results " + (partition != -1 ? "for partition #" + partition : "") + " using printer " + rp.getClass() + ":\n" + msg);
      }
      predictions[ENTITY_LEVEL][partitionIndex] = predicted;
    }
   
    //
    // annotate relations
    //
    if (MachineReadingProperties.extractRelations) {
      assert(relationExtractor != null);
     
      Annotation predicted = (MachineReadingProperties.testRelationsUsingPredictedEntities ? predictions[ENTITY_LEVEL][partitionIndex] : AnnotationUtils.deepMentionCopy(testing));
      // make sure the entities have the syntactic head and span set. we need this for relation extraction features
      assignSyntacticHeadToEntities(predicted);
      relationExtractor.annotate(predicted);
     
      if (relationExtractionPostProcessor == null) {

 
  @SuppressWarnings("unchecked")
  protected void makeDataSets(Annotation training, Annotation testing, Annotation auxDataset) {
    if (!MachineReadingProperties.crossValidate) {
      datasets = new Pair[1];
      Annotation trainingEnhanced = training;
      if (auxDataset != null) {
        trainingEnhanced = new Annotation(training.get(TextAnnotation.class));
        for (int i = 0; i < AnnotationUtils.sentenceCount(training); i++) {
          AnnotationUtils.addSentence(trainingEnhanced, AnnotationUtils.getSentence(training, i));
        }
        for (int ind = 0; ind < AnnotationUtils.sentenceCount(auxDataset); ind++) {
          AnnotationUtils.addSentence(trainingEnhanced, AnnotationUtils.getSentence(auxDataset, ind));
        }
      }
      datasets[0] = new Pair<Annotation, Annotation>(trainingEnhanced, testing);
     
      predictions = new Annotation[3][1];
    } else {
      assert(MachineReadingProperties.kfold > 1);
      datasets = new Pair[MachineReadingProperties.kfold];
      AnnotationUtils.shuffleSentences(training);
      for (int partition = 0; partition < MachineReadingProperties.kfold; partition++) {
        int begin = AnnotationUtils.sentenceCount(training) * partition / MachineReadingProperties.kfold;
        int end = AnnotationUtils.sentenceCount(training) * (partition + 1) / MachineReadingProperties.kfold;
        MachineReadingProperties.logger.info("Creating partition #" + partition + " using offsets [" + begin + ", " + end + ") out of " + AnnotationUtils.sentenceCount(training));
        Annotation partitionTrain = new Annotation("");
        Annotation partitionTest = new Annotation("");
        for (int i = 0; i < AnnotationUtils.sentenceCount(training); i++) {
          if (i < begin) {
            AnnotationUtils.addSentence(partitionTrain, AnnotationUtils.getSentence(training, i));
          } else if (i < end) {
            AnnotationUtils.addSentence(partitionTest, AnnotationUtils.getSentence(training, i));
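The fold boundaries in makeDataSets come from integer arithmetic that tiles the sentence range into K nearly equal, contiguous partitions; a standalone check of that arithmetic:

public class FoldBoundariesExample {
  public static void main(String[] args) {
    int n = 10;     // sentence count
    int kfold = 3;  // number of partitions
    for (int partition = 0; partition < kfold; partition++) {
      int begin = n * partition / kfold;        // inclusive
      int end = n * (partition + 1) / kfold;    // exclusive
      System.out.println("partition " + partition + ": [" + begin + ", " + end + ")");
    }
    // Prints [0, 3), [3, 6), [6, 10): the ranges cover [0, n) exactly,
    // with sizes differing by at most one.
  }
}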

  }
 
  /** Keeps only the first {@code percentage} fraction of sentences from the given corpus. */
  static Annotation keepPercentage(Annotation corpus, double percentage) {
    System.err.println("Using percentage of train: " + percentage);
    Annotation smaller = new Annotation("");
    List<CoreMap> sents = new ArrayList<CoreMap>();
    List<CoreMap> fullSents = corpus.get(SentencesAnnotation.class);
    double smallSize = (double) fullSents.size() * percentage;
    for (int i = 0; i < smallSize; i++) {
      sents.add(fullSents.get(i));
    }
    System.err.println("TRAIN corpus size reduced from " + fullSents.size() + " to " + sents.size());
    smaller.set(SentencesAnnotation.class, sents);
    return smaller;
  }
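A usage sketch (the corpus variable is hypothetical). Note that the loop condition i < smallSize rounds up, keeping ceil(size * percentage) sentences:

// Keep roughly the first 10% of a hypothetical training corpus.
Annotation smallTrain = keepPercentage(fullTrain, 0.10);
// e.g., 25 sentences * 0.10 = 2.5, so 3 sentences are kept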

   * @return a list of RelationsSentences
   */
  protected Annotation loadOrMakeSerializedSentences(
      String sentencesPath, GenericDataSetReader reader,
      File serializedSentences) throws IOException, ClassNotFoundException {
    Annotation corpusSentences;
    // if the serialized file exists, just read it; otherwise read the source
    // and save the serialized file to disk
    if (MachineReadingProperties.serializeCorpora && serializedSentences.exists() && !forceParseSentences) {
      MachineReadingProperties.logger.info("Loaded serialized sentences from " + serializedSentences.getAbsolutePath() + "...");
      corpusSentences = (Annotation) IOUtils.readObjectFromFile(serializedSentences);
      MachineReadingProperties.logger.info("Done. Loaded " + corpusSentences.get(CoreAnnotations.SentencesAnnotation.class).size() + " sentences.");
    } else {
      // read the corpus
      MachineReadingProperties.logger.info("Parsing corpus sentences...");
      if (MachineReadingProperties.serializeCorpora)
        MachineReadingProperties.logger.info("These sentences will be serialized to " + serializedSentences.getAbsolutePath());
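The load-or-parse-then-serialize pattern above, in isolation. IOUtils.readObjectFromFile and IOUtils.writeObjectToFile are the CoreNLP helpers; parseCorpus is a stand-in for the GenericDataSetReader call:

import java.io.File;
import java.io.IOException;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.pipeline.Annotation;

public class CorpusCacheExample {
  /** Returns the cached corpus if present; otherwise builds and caches it. */
  static Annotation loadOrBuild(File cache, String sourcePath)
      throws IOException, ClassNotFoundException {
    if (cache.exists()) {
      return IOUtils.readObjectFromFile(cache);   // fast path: deserialize
    }
    Annotation corpus = parseCorpus(sourcePath);  // slow path: re-read the source
    IOUtils.writeObjectToFile(corpus, cache);     // cache for next time
    return corpus;
  }

  /** Stand-in for reading and parsing the raw corpus. */
  static Annotation parseCorpus(String path) {
    return new Annotation("");
  }
}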
