Package cc.mallet.types

Examples of cc.mallet.types.TokenSequence


  public Instance pipe (Instance carrier)
  {
    CharSequence string = (CharSequence) carrier.getData();
    StringTokenization dataTokens = new StringTokenization (string);
    TokenSequence targetTokens = new TokenSequence ();
    String tag = backgroundTag;
    String nextTag = backgroundTag;
    Matcher m = sgmlPattern.matcher (string);
    int textStart = 0;
    int textEnd = 0;
    int nextStart = 0;
    boolean done = false;

    logger.fine(sgmlPattern.pattern());
    logger.finer(string.toString());

    while (!done) {
      done = !(m.find());
      if (done)
        textEnd = string.length(); // culotta: changed from string.length()-1
      else {
        String sgml = m.group();
        logger.finer ("SGML = "+sgml);

        int groupCount = m.groupCount();
        logger.finer(Integer.toString (groupCount));

        if (sgml.charAt(1) == '/')
          nextTag = backgroundTag;
        else{
          //nextTag = m.group(0);
          nextTag = sgml.substring(1, sgml.length()-1);
        }
        logger.finer("nextTag: " + nextTag);

        nextStart = m.end()// m.end returns one beyond index of last match char
        textEnd = m.start()// String.subtring does not include index end
        logger.finer ("Text start/end "+textStart+" "+textEnd);
      }
      if (textEnd - textStart > 0) {
        logger.finer ("Tag = "+tag);
        logger.finer ("Target = "+string.subSequence (textStart, textEnd));
        lexer.setCharSequence (string.subSequence (textStart, textEnd));
        while (lexer.hasNext()) {
          lexer.next ();
          int tokStart = textStart + lexer.getStartOffset ();
          int tokEnd = textStart + lexer.getEndOffset ();
          dataTokens.add (new StringSpan (string, tokStart, tokEnd));
          targetTokens.add (new Token (tag));
        }
      }
      textStart = nextStart;
      tag = nextTag;
    }
View Full Code Here


//        new SGML2TokenSequence (new CharSequenceLexer (Pattern.compile (".")), "O")
        });

      for (int i = 0; i < args.length; i++) {
        Instance carrier = p.instanceFrom(new Instance (new File(args[i]), null, null, null));
        TokenSequence data = (TokenSequence) carrier.getData();
        TokenSequence target = (TokenSequence) carrier.getTarget();
        logger.finer ("===");
        logger.info (args[i]);
        for (int j = 0; j < data.size(); j++)
          logger.info (target.get(j).getText()+" "+data.get(j).getText());
      }
    } catch (Exception e) {
      System.out.println (e);
      e.printStackTrace();
    }
View Full Code Here

public class TokenSequenceDocHeader extends Pipe implements Serializable
{
 
  public Instance pipe (Instance carrier)
  {
    TokenSequence ts = (TokenSequence) carrier.getData();
    if (ts.size() > 3
        && (ts.get(2).getText().equals("-") || ts.get(3).getText().equals("-"))
        && ts.get(1).getText().matches("[A-Z]+")) {
      String header = ts.get(1).getText();
      if (header.equals("PRESS"))        // Don't bother with "PRESS DIGEST" headers
        return carrier;
      String featureName = "HEADER="+header;
      for (int i = 0; i < ts.size(); i++) {
        Token t = ts.get(i);
        // Only apply this feature to capitalized words, because if we apply it to everything
        // we easily get an immense number of possible feature conjunctions, (e.g. every word
        // with each of these HEADER= features.
        if (t.getText().matches("^[A-Z].*"))
          t.setFeatureValue (featureName, 1.0);
View Full Code Here

  }

  public Instance pipe (Instance carrier)
  {
    if (carrier.getData() instanceof CharSequence)
      carrier.setData(new TokenSequence (ngramify ((CharSequence)carrier.getData())));
    else if (carrier.getData() instanceof TokenSequence) {
      TokenSequence ts = (TokenSequence) carrier.getData();
      TokenSequence ret = new TokenSequence ();
      for (int i = 0; i < ts.size(); i++)
        ret.add (ngramify (ts.get(i).getText()).toString());
      carrier.setData(ret);
    } else
      throw new IllegalArgumentException ("Unhandled type "+carrier.getData().getClass());
    return carrier;
  }
View Full Code Here

    Object data = carrier.getData ();
    if (data instanceof Tokenization) {
      // we're done
    } else if (data instanceof TokenSequence) {
      StringBuffer buf = new StringBuffer ();
      TokenSequence ts = (TokenSequence) data;
      StringTokenization spans = new StringTokenization (buf)// I can use a StringBuffer as the doc! Awesome!

      for (int i = 0; i < ts.size(); i++) {
        Token token = ts.get(i);

        int start = buf.length ();
        buf.append (token.getText());
        int end = buf.length();
View Full Code Here

    int[] numTrueSegments, numPredictedSegments, numCorrectSegments;
    int allIndex = segmentStartTags.length;
    numTrueSegments = new int[allIndex+1];
    numPredictedSegments = new int[allIndex+1];
    numCorrectSegments = new int[allIndex+1];
    TokenSequence sourceTokenSequence = null;

    totalTokens = numCorrectTokens = 0;
    for (int n = 0; n < numTrueSegments.length; n++)
      numTrueSegments[n] = numPredictedSegments[n] = numCorrectSegments[n] = 0;
    for (int i = 0; i < data.size(); i++) {
      if (viterbiOutputStream != null)
        viterbiOutputStream.println ("Viterbi path for "+description+" instance #"+i);
      Instance instance = data.get(i);
      Sequence input = (Sequence) instance.getData();     
      //String tokens = null;
      //if (instance.getSource() != null)
      //tokens = (String) instance.getSource().toString();
      Sequence trueOutput = (Sequence) instance.getTarget();
      assert (input.size() == trueOutput.size());
      Sequence predOutput = (Sequence) predictedSequences.get (i);
      if (predOutput == null) // skip this instance
        continue;
      assert (predOutput.size() == trueOutput.size());
      int trueStart, predStart;        // -1 for non-start, otherwise index into segmentStartTag
      for (int j = 0; j < trueOutput.size(); j++) {
        totalTokens++;
        if (trueOutput.get(j).equals(predOutput.get(j)))
          numCorrectTokens++;
        trueStart = predStart = -1;
        // Count true segment starts
        for (int n = 0; n < segmentStartTags.length; n++) {
          if (segmentStartTags[n].equals(trueOutput.get(j))) {
            numTrueSegments[n]++;
            numTrueSegments[allIndex]++;
            trueStart = n;
            break;
          }
        }
        // Count predicted segment starts
        for (int n = 0; n < segmentStartTags.length; n++) {
          if (segmentStartTags[n].equals(predOutput.get(j))) {
            numPredictedSegments[n]++;
            numPredictedSegments[allIndex]++;
            predStart = n;
          }
        }
        if (trueStart != -1 && trueStart == predStart) {
          // Truth and Prediction both agree that the same segment tag-type is starting now
          int m;
          boolean trueContinue = false;
          boolean predContinue = false;
          for (m = j+1; m < trueOutput.size(); m++) {
            trueContinue = segmentContinueTags[predStart].equals (trueOutput.get(m));
            predContinue = segmentContinueTags[predStart].equals (predOutput.get(m));
            if (!trueContinue || !predContinue) {
              if (trueContinue == predContinue) {
                // They agree about a segment is ending somehow
                numCorrectSegments[predStart]++;
                numCorrectSegments[allIndex]++;
              }
              break;
            }
          }
          // for the case of the end of the sequence
          if (m == trueOutput.size()) {
            if (trueContinue == predContinue) {
              numCorrectSegments[predStart]++;
              numCorrectSegments[allIndex]++;
            }
          }
        }

        if (viterbiOutputStream != null) {
          FeatureVector fv = (FeatureVector) input.get(j);
          //viterbiOutputStream.println (tokens.charAt(j)+" "+trueOutput.get(j).toString()+
          //'/'+predOutput.get(j).toString()+"  "+ fv.toString(true));
          if (sourceTokenSequence != null)
            viterbiOutputStream.print (sourceTokenSequence.get(j).getText()+": ");
          viterbiOutputStream.println (trueOutput.get(j).toString()+
                                       '/'+predOutput.get(j).toString()+"  "+ fv.toString(true));
           
        }
      }
View Full Code Here

   
    if (!(carrier.getData() instanceof CharSequence))
      throw new IllegalArgumentException ();
    String s = carrier.getData().toString();
    String[] lines = s.split (System.getProperty ("line.separator"));
    carrier.setData (new TokenSequence (lines));
    return carrier;
  }
View Full Code Here

  }
 
  public Instance pipe (Instance carrier)
  {
    String newTerm = null;
    TokenSequence tmpTS = new TokenSequence();
    TokenSequence ts = (TokenSequence) carrier.getData();

    for (int i = 0; i < ts.size(); i++) {
      Token t = ts.get(i);
      for(int j = 0; j < gramSizes.length; j++) {
        int len = gramSizes[j];
        if (len <= 0 || len > (i+1)) continue;
        if (len == 1) { tmpTS.add(t); continue; }
        newTerm = new String(t.getText());
        for(int k = 1; k < len; k++)
          newTerm = ts.get(i-k).getText() + "_" + newTerm;
        tmpTS.add(newTerm);
      }
    }

    carrier.setData(tmpTS);
View Full Code Here

  public Instance pipe (Instance carrier)
  {
    if (!(carrier.getData() instanceof CharSequence))
      throw new ClassCastException ("carrier.data is a " + carrier.getData().getClass().getName() +
                                   " not a CharSequence");
    TokenSequence dataTokens = new TokenSequence ();
     TokenSequence targetTokens = new TokenSequence ();
    CharSequence string = (CharSequence) carrier.getData();
    String tag = backgroundTag;
    String nextTag = backgroundTag;
    Matcher m = sgmlPattern.matcher (string);
    int textStart = 0;
    int textEnd = 0;
    int nextStart = 0;
    boolean done = false;

    while (!done) {
      done = !findNextValidMatch (m);
      if (done)
        textEnd = string.length()-1;
      else {
        String sgml = m.group();
        int groupCount = m.groupCount();
        if (sgml.charAt(1) == '/')
          nextTag = backgroundTag;
        else{
          nextTag = m.group(0);
          nextTag = sgml.substring(1, sgml.length()-1);
        }
        nextStart = m.end();
        textEnd = m.start();
      }
      if (textEnd - textStart > 0) {
        lexer.setCharSequence (string.subSequence (textStart, textEnd));
        while (lexer.hasNext()) {
          dataTokens.add (new Token ((String) lexer.next()));
          targetTokens.add (new Token (tag));
        }
      }
      textStart = nextStart;
      tag = nextTag;
    }
View Full Code Here

    this (namePrefix, leftBoundaryOffset, rightBoundaryOffset, null, true);
  }
 
  public Instance pipe (Instance carrier)
  {
    TokenSequence ts = (TokenSequence) carrier.getData();
    int tsSize = ts.size();
    PropertyList[] newFeatures = new PropertyList[tsSize];
    for (int i = 0; i < tsSize; i++) {
      Token t = ts.get (i);
      PropertyList pl = t.getFeatures();
      newFeatures[i] = pl;
      for (int position = i + leftBoundary; position < i + rightBoundary; position++) {
        if (position == i && !includeCurrentToken)
          continue;
        PropertyList pl2;
        if (position < 0)
          pl2 = startfs[-position];
        else if (position >= tsSize)
          pl2 = endfs[position-tsSize];
        else
          pl2 = ts.get(position).getFeatures ();
        PropertyList.Iterator pl2i = pl2.iterator();
        while (pl2i.hasNext()) {
          pl2i.next();
          String key = pl2i.getKey();
          if (featureRegex == null || featureRegex.matcher(key).matches()) {
            newFeatures[i] = PropertyList.add ((namePrefixLeft == null || position-i>0 ? namePrefix : namePrefixLeft)+key,
                                               pl2i.getNumericValue(), newFeatures[i]);
          }
        }
      }
    }
    for (int i = 0; i < tsSize; i++) {
      // Put the new PropertyLists in place
      ts.get (i).setFeatures (newFeatures[i]);
    }
    return carrier;
  }
View Full Code Here

TOP

Related Classes of cc.mallet.types.TokenSequence

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.