Package cc.mallet.types

Examples of cc.mallet.types.TokenSequence


  {
  }
 
  public Instance pipe (Instance carrier)
  {
    carrier.setData(new TokenSequence((CharSequence[]) carrier.getData()));
    return carrier;
  }
View Full Code Here


  public Instance pipe (Instance carrier)
  {
    CharSequence string = (CharSequence) carrier.getData();
    StringTokenization dataTokens = new StringTokenization (string);
    TokenSequence targetTokens = new TokenSequence ();
    String tag = backgroundTag;
    String nextTag = backgroundTag;
    Matcher m = sgmlPattern.matcher (string);
    int textStart = 0;
    int textEnd = 0;
    int nextStart = 0;
    boolean done = false;

    logger.fine(sgmlPattern.pattern());
    logger.finer(string.toString());

    while (!done) {
      done = !(m.find());
      if (done)
        textEnd = string.length(); // culotta: changed from string.length()-1
      else {
        String sgml = m.group();
        logger.finer ("SGML = "+sgml);

        int groupCount = m.groupCount();
        logger.finer(Integer.toString (groupCount));

        if (sgml.charAt(1) == '/')
          nextTag = backgroundTag;
        else{
          //nextTag = m.group(0);
          nextTag = sgml.substring(1, sgml.length()-1);
        }
        logger.finer("nextTag: " + nextTag);

        nextStart = m.end()// m.end returns one beyond index of last match char
        textEnd = m.start()// String.subtring does not include index end
        logger.finer ("Text start/end "+textStart+" "+textEnd);
      }
      if (textEnd - textStart > 0) {
        logger.finer ("Tag = "+tag);
        logger.finer ("Target = "+string.subSequence (textStart, textEnd));
        lexer.setCharSequence (string.subSequence (textStart, textEnd));
        while (lexer.hasNext()) {
          lexer.next ();
          int tokStart = textStart + lexer.getStartOffset ();
          int tokEnd = textStart + lexer.getEndOffset ();
          dataTokens.add (new StringSpan (string, tokStart, tokEnd));
          targetTokens.add (new Token (tag));
        }
      }
      textStart = nextStart;
      tag = nextTag;
    }
View Full Code Here

//        new SGML2TokenSequence (new CharSequenceLexer (Pattern.compile (".")), "O")
        });

      // Treat every command-line argument as a file path: wrap it in an
      // Instance, run it through p, and log each resulting token next to the
      // text of its target entry.
      for (int i = 0; i < args.length; i++) {
        Instance carrier = p.instanceFrom(new Instance (new File(args[i]), null, null, null));
        // Both data and target are cast to TokenSequence after processing.
        TokenSequence data = (TokenSequence) carrier.getData();
        TokenSequence target = (TokenSequence) carrier.getTarget();
        logger.finer ("===");
        logger.info (args[i]);
        // target is indexed with the same j as data, so this assumes target
        // has at least data.size() entries -- TODO confirm against the pipe.
        for (int j = 0; j < data.size(); j++)
          logger.info (target.get(j).getText()+" "+data.get(j).getText());
      }
    } catch (Exception e) {
      // Any failure is reported on standard output / standard error via the
      // two calls below; nothing is rethrown from this handler.
      System.out.println (e);
      e.printStackTrace();
    }
View Full Code Here

    // Tallies token-level accuracy and, per segment tag, the number of true,
    // predicted and correctly predicted segments across all instances in
    // `data`, comparing each instance's target with predictedSequences.get(i).
    int[] numTrueSegments, numPredictedSegments, numCorrectSegments;
    // One counter per entry of segmentStartTags, plus a final slot (allIndex)
    // that accumulates the totals over all tags.
    int allIndex = segmentStartTags.length;
    numTrueSegments = new int[allIndex+1];
    numPredictedSegments = new int[allIndex+1];
    numCorrectSegments = new int[allIndex+1];
    // NOTE(review): never assigned in this excerpt, so the source-token
    // prefix in the Viterbi output below is not printed by the visible code.
    TokenSequence sourceTokenSequence = null;

    totalTokens = numCorrectTokens = 0;
    // Explicit reset of all counters (freshly allocated int arrays are
    // already zero-filled in Java).
    for (int n = 0; n < numTrueSegments.length; n++)
      numTrueSegments[n] = numPredictedSegments[n] = numCorrectSegments[n] = 0;
    for (int i = 0; i < data.size(); i++) {
      if (viterbiOutputStream != null)
        viterbiOutputStream.println ("Viterbi path for "+description+" instance #"+i);
      Instance instance = data.get(i);
      Sequence input = (Sequence) instance.getData();     
      //String tokens = null;
      //if (instance.getSource() != null)
      //tokens = (String) instance.getSource().toString();
      Sequence trueOutput = (Sequence) instance.getTarget();
      assert (input.size() == trueOutput.size());
      Sequence predOutput = (Sequence) predictedSequences.get (i);
      if (predOutput == null) // skip this instance
        continue;
      assert (predOutput.size() == trueOutput.size());
      int trueStart, predStart;        // -1 for non-start, otherwise index into segmentStartTag
      for (int j = 0; j < trueOutput.size(); j++) {
        // Token-level accuracy: count every position, and those where the
        // true and predicted labels are equal.
        totalTokens++;
        if (trueOutput.get(j).equals(predOutput.get(j)))
          numCorrectTokens++;
        trueStart = predStart = -1;
        // Count true segment starts
        // (stops at the first matching start tag)
        for (int n = 0; n < segmentStartTags.length; n++) {
          if (segmentStartTags[n].equals(trueOutput.get(j))) {
            numTrueSegments[n]++;
            numTrueSegments[allIndex]++;
            trueStart = n;
            break;
          }
        }
        // Count predicted segment starts
        // NOTE(review): unlike the loop above there is no break here, so every
        // matching start tag is counted and predStart ends up as the last
        // matching index -- only differs if segmentStartTags has duplicates.
        for (int n = 0; n < segmentStartTags.length; n++) {
          if (segmentStartTags[n].equals(predOutput.get(j))) {
            numPredictedSegments[n]++;
            numPredictedSegments[allIndex]++;
            predStart = n;
          }
        }
        if (trueStart != -1 && trueStart == predStart) {
          // Truth and Prediction both agree that the same segment tag-type is starting now
          int m;
          boolean trueContinue = false;
          boolean predContinue = false;
          // Walk forward while both sequences carry this tag's continue label.
          // At the first position where at least one of them stops, the
          // segment counts as correct only if both stopped at that position.
          for (m = j+1; m < trueOutput.size(); m++) {
            trueContinue = segmentContinueTags[predStart].equals (trueOutput.get(m));
            predContinue = segmentContinueTags[predStart].equals (predOutput.get(m));
            if (!trueContinue || !predContinue) {
              if (trueContinue == predContinue) {
                // They agree about a segment is ending somehow
                numCorrectSegments[predStart]++;
                numCorrectSegments[allIndex]++;
              }
              break;
            }
          }
          // for the case of the end of the sequence
          // (m reaches size() only when the loop above never hit its break,
          // including when it did not execute at all because j is the last
          // position; both flags are then equal)
          if (m == trueOutput.size()) {
            if (trueContinue == predContinue) {
              numCorrectSegments[predStart]++;
              numCorrectSegments[allIndex]++;
            }
          }
        }

        // Optional per-token dump: "true/predicted  features", prefixed with
        // the source token text when sourceTokenSequence is non-null.
        if (viterbiOutputStream != null) {
          FeatureVector fv = (FeatureVector) input.get(j);
          //viterbiOutputStream.println (tokens.charAt(j)+" "+trueOutput.get(j).toString()+
          //'/'+predOutput.get(j).toString()+"  "+ fv.toString(true));
          if (sourceTokenSequence != null)
            viterbiOutputStream.print (sourceTokenSequence.get(j).getText()+": ");
          viterbiOutputStream.println (trueOutput.get(j).toString()+
                                       '/'+predOutput.get(j).toString()+"  "+ fv.toString(true));
           
        }
      }
View Full Code Here

  }

        @Override
  public Instance pipe (Instance carrier) {
            String newTerm = null;
            TokenSequence tmpTS = new TokenSequence();
            TokenSequence ts = (TokenSequence) carrier.getData();

            for (int i = 0; i < ts.size(); i++) {
                Token t = ts.get(i);
                for(int j = 0; j < gramSizes.length; j++) {
                    int len = gramSizes[j];
                    if (len <= 0 || len > (i+1)) continue;
                    if (len == 1) { tmpTS.add(t); continue; }
                    newTerm = new String(t.getText());
                    for(int k = 1; k < len; k++)
                        newTerm = ts.get(i-k).getText() + delim + newTerm;
                    tmpTS.add(newTerm);
                }
            }
            carrier.setData(tmpTS);
            return carrier;
View Full Code Here

TOP

Related Classes of cc.mallet.types.TokenSequence

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.