Examples of SMOTEplus

org.apache.ctakes.temporal.utils.SMOTEplus
A simple implementation of the SMOTE algorithm (Nitesh V. Chawla et al., "SMOTE: Synthetic Minority Over-sampling Technique", 06/02). Finds the K nearest neighbors of each minority instance. @author Chen Lin @DCT : 12/28/2012 Modified on 1/4/2013

Examples of org.apache.ctakes.temporal.utils.SMOTEplus

  }


  @Override
  public void process(JCas jCas, Segment segment) throws AnalysisEngineProcessException {
    //TRY SMOTE algorithm here to generate more minority class samples
      SMOTEplus smote = new SMOTEplus((int)Math.ceil(this.smoteNumOfNeighbors));
      
    // classify tokens within each sentence
    for (Sentence sentence : JCasUtil.selectCovered(jCas, Sentence.class, segment)) {
      List<BaseToken> tokens = JCasUtil.selectCovered(jCas, BaseToken.class, sentence);
      
      // during training, the list of all outcomes for the tokens
      List<String> outcomes;
      if (this.isTraining()) {
        List<TimeMention> times = JCasUtil.selectCovered(jCas, TimeMention.class, sentence);
        outcomes = this.timeChunking.createOutcomes(jCas, tokens, times);
      }
      // during prediction, the list of outcomes predicted so far
      else {
        outcomes = new ArrayList<String>();
      }


      // extract features for all tokens
      int tokenIndex = -1;
      for (BaseToken token : tokens) {
        ++tokenIndex;


        List<Feature> features = new ArrayList<Feature>();
        // features from token attributes
        for (SimpleFeatureExtractor extractor : this.tokenFeatureExtractors) {
          features.addAll(extractor.extract(jCas, token));
        }
        // features from surrounding tokens
        for (CleartkExtractor extractor : this.contextFeatureExtractors) {
          features.addAll(extractor.extractWithin(jCas, token, sentence));
        }
        // features from previous classifications
        int nPreviousClassifications = 2;
        for (int i = nPreviousClassifications; i > 0; --i) {
          int index = tokenIndex - i;
          String previousOutcome = index < 0 ? "O" : outcomes.get(index);
          features.add(new Feature("PreviousOutcome_" + i, previousOutcome));
        }
        //add segment ID as a features:
        features.add(new Feature("SegmentID", segment.getId()));


        // features from dominating parse tree
        //        for(SimpleFeatureExtractor extractor : this.parseFeatureExtractors){
        BaseToken startToken = token;
        for(int i = tokenIndex-1; i >= 0; --i){
          String outcome = outcomes.get(i);
          if(outcome.equals("O")){
            break;
          }
          startToken = tokens.get(i);
        }
        features.addAll(parseExtractor.extract(jCas, startToken.getBegin(), token.getEnd()));
        //        }
        
        // apply feature selection, if necessary
            if (this.featureSelection != null) {
              features = this.featureSelection.transform(features);
            }
        
        // if training, write to data file
            if (this.isTraining()) {
              String outcome = outcomes.get(tokenIndex);
              // if it is an "O" down-sample it
              if (outcome.equals("O")) {
                this.dataWriter.write(new Instance<String>(outcome, features));


              }else{//for minority instances:
                Instance<String> minorityInst = new Instance<String>(outcome, features);
                this.dataWriter.write(minorityInst);
                smote.addInstance(minorityInst);//add minority instances to SMOTE algorithm
              }
            }else {// if predicting, add prediction to outcomes
              outcomes.add(this.classifier.classify(features));
            }
      }


      // during prediction, convert chunk labels to times and add them to the CAS
      if (!this.isTraining()) {
        JCas timexCas;
        try {
          timexCas = jCas.getView(timexView);
        } catch (CASException e) {
          throw new AnalysisEngineProcessException(e);
        }
        this.timeChunking.createChunks(timexCas, tokens, outcomes);
      }
    }
    if(this.isTraining() && this.smoteNumOfNeighbors >= 1){ //add synthetic instances to datawriter, if smote is selected
        Iterable<Instance<String>> syntheticInsts = smote.populateMinorityClass();
        for( Instance<String> sytheticInst: syntheticInsts){
          this.dataWriter.write(sytheticInst);
        }
      }
  }

View Full Code Here

Examples of org.apache.ctakes.temporal.utils.SMOTEplus

    }


    Random rand = new Random();
    
    //TRY SMOTE algorithm here to generate more minority class samples
    SMOTEplus smote = new SMOTEplus((int)Math.ceil(this.smoteNumOfNeighbors));
        
    // classify tokens within each sentence
    for (Sentence sentence : JCasUtil.selectCovered(jCas, Sentence.class, segment)) {
      List<BaseToken> tokens = JCasUtil.selectCovered(jCas, BaseToken.class, sentence);


      // during training, the list of all outcomes for the tokens
      List<String> outcomes;
      if (this.isTraining()) {
        List<EventMention> events = Lists.newArrayList();
        for (EventMention event : JCasUtil.selectCovered(jCas, EventMention.class, sentence)) {
          if (event.getClass().equals(EventMention.class)) {
            events.add(event);
          }
        }
        outcomes = this.eventChunking.createOutcomes(jCas, tokens, events);
      }
      // during prediction, the list of outcomes predicted so far
      else {
        outcomes = new ArrayList<String>();
      }


      // get BIO entity tags for each entity type
      int[] entityTypeIDs = new int[] {
          CONST.NE_TYPE_ID_ANATOMICAL_SITE,
          CONST.NE_TYPE_ID_DISORDER,
          CONST.NE_TYPE_ID_DRUG,
          CONST.NE_TYPE_ID_FINDING,
          CONST.NE_TYPE_ID_PROCEDURE,
          CONST.NE_TYPE_ID_UNKNOWN };
      List<IdentifiedAnnotation> entities;
      if (this.isTraining()) {
        entities = Lists.newArrayList();
        for (IdentifiedAnnotation entity : JCasUtil.selectCovered(jCas, IdentifiedAnnotation.class, sentence)) {
          if (!entity.getClass().equals(EventMention.class)) {
            entities.add(entity);
          }
        }
      } else {
        entities = JCasUtil.selectCovered(jCas, IdentifiedAnnotation.class, sentence);
      }
      
      List<ChunkingExtractor> chunkingExtractors = Lists.newArrayList(); 
      for (int typeID : entityTypeIDs) {
        Predicate<IdentifiedAnnotation> hasTypeID = hasEntityType(typeID);
        List<IdentifiedAnnotation> subEntities = Lists.newArrayList(Iterables.filter(entities, hasTypeID));
        chunkingExtractors.add(new ChunkingExtractor("EntityTag", this.entityChunking, jCas, tokens, subEntities));
      }
      
      // add extractor for phase chunks
      List<Chunk> chunks = JCasUtil.selectCovered(jCas, Chunk.class, sentence);
      chunkingExtractors.add(new ChunkingExtractor("PhraseTag", this.phraseChunking, jCas, tokens, chunks));


      // extract features for all tokens
      int tokenIndex = -1;
      int nChunkLabelsBefore = 2;
      int nChunkLabelsAfter = 2;
      int nPreviousClassifications = 2;


      for (BaseToken token : tokens) {
        ++tokenIndex;


        List<Feature> features = new ArrayList<Feature>();


        // features from previous classifications
        for (int i = nPreviousClassifications; i > 0; --i) {
          int index = tokenIndex - i;
          String previousOutcome = index < 0 ? "O" : outcomes.get(index);
          features.add(new Feature("PreviousOutcome_" + i, previousOutcome));
        }
        
        // features from token attributes
        features.addAll(this.tokenFeatureExtractor.extract(jCas, token));


        // features from surrounding tokens
        features.addAll(this.contextFeatureExtractor.extractWithin(jCas, token, sentence));
        
        // features from ends of entities
        features.addAll(endOfEntityFeatures.get(token));


        // features from surrounding entity, phrase, etc. chunk-labels
        for (ChunkingExtractor extractor : chunkingExtractors) {
          features.addAll(extractor.extract(tokenIndex, nChunkLabelsBefore, nChunkLabelsAfter));
        }
        
        // features from semantic roles
        features.addAll(predicateArgumentExtractor.extract(token));


        // apply feature selection, if necessary
        if (this.featureSelection != null) {
          features = this.featureSelection.transform(features);
        }


        // if training, write to data file
        if (this.isTraining()) {
          String outcome = outcomes.get(tokenIndex);
          // if it is an "O" down-sample it
          if (outcome.equals("O")) {
            if (rand.nextDouble() <= this.probabilityOfKeepingANegativeExample){
              this.dataWriter.write(new Instance<String>(outcome, features));
            }      
          }else{//for minority instances:
            Instance<String> minorityInst = new Instance<String>(outcome, features);
            this.dataWriter.write(minorityInst);
            smote.addInstance(minorityInst);//add minority instances to SMOTE algorithm
          }
        }


        // if predicting, add prediction to outcomes
        else {
          outcomes.add(this.classifier.classify(features));
        }
      }


      // during prediction, convert chunk labels to events and add them to the CAS
      if (!this.isTraining()) {
        this.eventChunking.createChunks(jCas, tokens, outcomes);
      }
    }
    if(this.isTraining() && this.smoteNumOfNeighbors >= 1){ //add synthetic instances to datawriter, if smote is selected
      Iterable<Instance<String>> syntheticInsts = smote.populateMinorityClass();
      for( Instance<String> sytheticInst: syntheticInsts){
        this.dataWriter.write(sytheticInst);
      }
    }

View Full Code Here

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.