Package cc.mallet.pipe

Examples of cc.mallet.pipe.CharSequence2TokenSequence


    }
  }

  private Pipe makeSpacePredictionPipe() {
    Pipe p = new SerialPipes(new Pipe[] {
        new CharSequence2TokenSequence("."),
        new TokenSequenceLowercase(),
        new TestCRFTokenSequenceRemoveSpaces(),
        new TokenText(),
        new OffsetConjunctions(true, new int[][] { { 0 }, { 1 },
            { -1, 0 },
View Full Code Here


  }

  public void testPrint() {
    Pipe p = new SerialPipes(new Pipe[] {
        new CharSequence2TokenSequence("."), new TokenText(),
        new TestCRFTokenSequenceRemoveSpaces(),
        new TokenSequence2FeatureVectorSequence(),
        new PrintInputAndTarget(), });
    InstanceList one = new InstanceList(p);
    String[] data = new String[] { "ABCDE", };
View Full Code Here

    crf.print();
  }

  public void testCopyStatesAndWeights() {
    Pipe p = new SerialPipes(new Pipe[] {
        new CharSequence2TokenSequence("."), new TokenText(),
        new TestCRFTokenSequenceRemoveSpaces(),
        new TokenSequence2FeatureVectorSequence(),
        new PrintInputAndTarget(), });
    InstanceList one = new InstanceList(p);
    String[] data = new String[] { "ABCDE", };
View Full Code Here


  public static  Pipe makeSpacePredictionPipe ()
  {
    Pipe p = new SerialPipes(new Pipe[]{
      new CharSequence2TokenSequence("."),
      new TokenSequenceLowercase(),
      new TestMEMMTokenSequenceRemoveSpaces(),
      new TokenText(),
      new OffsetConjunctions(true,
                             new int[][]{//{0}, /*{1},{-1,0},{0,1}, */
 
View Full Code Here

  }

  public void disabledtestPrint ()
  {
    Pipe p = new SerialPipes (new Pipe[] {
       new CharSequence2TokenSequence("."),
       new TokenText(),
       new TestMEMM.TestMEMMTokenSequenceRemoveSpaces(),
       new TokenSequence2FeatureVectorSequence(),
       new PrintInputAndTarget(),
    });
View Full Code Here

    ArrayList<Pipe> pipeList = new ArrayList<Pipe>();

    // Pipes: lowercase, tokenize, remove stopwords, map to features
    pipeList.add( new CharSequenceLowercase() );
    // Token format is defined by a regular expression
    pipeList.add( new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")) );
    pipeList.add( new TokenSequenceRemoveStopwords(new File("stoplist/en.txt"), "UTF-8", false, false, false) );
    //add bigram words
    //pipeList.add(new TokenSequenceNGrams(new int[] {2} ));
       
    //convert to feature
View Full Code Here

    private Pipe buildPipe() {
        Pattern tokenPattern = Pattern.compile("\\S[\\S]+\\S");
        int[] sizes = {1,2};
        ArrayList pipeList = new ArrayList();

        pipeList.add(new CharSequence2TokenSequence(tokenPattern));
        pipeList.add(new TokenSequenceRemoveStopwords(false, false)); // we should use a real stop word list
        pipeList.add(new TokenSequenceNGramsDelim(sizes, " "));
        pipeList.add(new TokenSequence2FeatureSequence());
        return new SerialPipes(pipeList);
    }
View Full Code Here

        //    "[\\p{L}\\p{N}_]+|[\\p{P}]+"   (a group of only letters and numbers OR
        //                                    a group of only punctuation marks)
        Pattern tokenPattern = Pattern.compile("[\\p{L}\\p{N}_]+|[\\p{P}]+");

        // Tokenize raw strings
        pipeList.add(new CharSequence2TokenSequence(tokenPattern));

        // Normalize all tokens to all lowercase
        pipeList.add(new TokenSequenceLowercase());

        // Remove stopwords from a standard English stoplist.
View Full Code Here

        //    "[\\p{L}\\p{N}_]+|[\\p{P}]+"   (a group of only letters and numbers OR
        //                                    a group of only punctuation marks)
        Pattern tokenPattern = Pattern.compile("[\\p{L}\\p{N}_]+|[\\p{P}]+");

        // Tokenize raw strings
        pipeList.add(new CharSequence2TokenSequence(tokenPattern));

        // Normalize all tokens to all lowercase
        pipeList.add(new TokenSequenceLowercase());

        // Remove stopwords from a standard English stoplist.
View Full Code Here

TOP

Related Classes of cc.mallet.pipe.CharSequence2TokenSequence

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.