Package org.apache.lucene.analysis

Examples of org.apache.lucene.analysis.TokenStream.addAttribute()
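
The snippets below come from open-source projects (Lucene's own tests, Apache Mahout, Apache Jackrabbit) and all follow the same pattern: obtain a TokenStream from an Analyzer, register the attributes you want to read with addAttribute(), call reset(), then iterate with incrementToken(). Two API notes: TermAttribute, used in the older excerpts, was deprecated in Lucene 3.1 in favor of CharTermAttribute; and reset() is required before the first call to incrementToken(), although several older excerpts omit it because early TokenStream implementations treated it as a no-op.

As a minimal, self-contained sketch of the pattern (the method name printTokens and the field name "myfield" are placeholders; Lucene 3.1+ is assumed):

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    // Prints every term the analyzer produces for the given text.
    public static void printTokens(Analyzer analyzer, String text) throws IOException {
      TokenStream ts = analyzer.tokenStream("myfield", new StringReader(text));
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();                       // mandatory before the first incrementToken()
      while (ts.incrementToken()) {
        System.out.println(termAtt.toString());
      }
      ts.end();                         // records end-of-stream state (e.g. final offset)
      ts.close();                       // releases the stream's resources
    }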


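From one of Lucene's own analyzer tests: random terms are analyzed serially and each resulting token is recorded as the expected value for the multithreaded check that follows.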
    // Build a map of term -> token up front; threads will later re-analyze all the
    // keys in the map and ensure the results are the same as the ones we produced
    // in serial fashion.

    for (int i = 0; i < numTestPoints; i++) {
      String term = _TestUtil.randomSimpleString(random);
      TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
      CharTermAttribute encodedBytes = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      assertTrue(ts.incrementToken());
      // ensure we make a copy of the actual bytes too
      map.put(term, encodedBytes.toString());
    }


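In the multithreaded phase, each worker re-analyzes every term and asserts that the token it produces matches the recorded value: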
          try {
            for (Map.Entry<String,String> mapping : map.entrySet()) {
              String term = mapping.getKey();
              String expected = mapping.getValue();
              TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
              CharTermAttribute encodedBytes = ts.addAttribute(CharTermAttribute.class);
              ts.reset();
              assertTrue(ts.incrementToken());
              assertEquals(expected, encodedBytes.toString());
            }
          } catch (IOException e) {
            // the catch body is elided in the original excerpt; rethrowing keeps the sketch valid
            throw new RuntimeException(e);
          }

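The next few excerpts use TermAttribute, the pre-3.1 counterpart of CharTermAttribute. This one writes a label, a tab, and a space-separated list of tokens: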
    try {
      TokenStream ts = analyzer.tokenStream(label, reader);
      writer.write(label);
      writer.write('\t'); // tab-separated, so the output matches Hadoop's standard TextInputFormat
      TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
      while (ts.incrementToken()) {
        char[] termBuffer = termAtt.termBuffer();
        int termLen = termAtt.termLength();
        writer.write(termBuffer, 0, termLen);
        writer.write(' ');
      }
    } finally {
      reader.close(); // cleanup elided in the original excerpt; closing the input Reader is typical
    }

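The same iteration pattern as a reusable helper: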
  /** Tokenizes the Reader with the given Analyzer and returns the tokens as a String array. */
  public static String[] readerToDocument(Analyzer analyzer, Reader reader) throws IOException {
    TokenStream ts = analyzer.tokenStream("", reader);
   
    List<String> coll = new ArrayList<String>();
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      char[] termBuffer = termAtt.termBuffer();
      int termLen = termAtt.termLength();
      String val = new String(termBuffer, 0, termLen);
      coll.add(val);
    }
    return coll.toArray(new String[coll.size()]);
  }

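From Mahout's WikipediaDatasetCreatorMapper: the page body is unwrapped from its <text> tags, HTML entities are unescaped, and the tokens are collected as the output value for the matched category: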
    if (!catMatch.equals("Unknown")) {
      // strip the wrapping <text>...</text> tags, then unescape HTML entities
      document = StringEscapeUtils.unescapeHtml(WikipediaDatasetCreatorMapper.CLOSE_TEXT_TAG_PATTERN.matcher(
        WikipediaDatasetCreatorMapper.OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst(""))
          .replaceAll(""));
      TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
      TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
      while (stream.incrementToken()) {
        contents.append(termAtt.termBuffer(), 0, termAtt.termLength()).append(' ');
      }
      output.collect(new Text(WikipediaDatasetCreatorMapper.SPACE_NON_ALPHA_PATTERN.matcher(catMatch)
          .replaceAll("_")), new Text(contents.toString()));
    }

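A Mahout mapper applies the same tokenize-and-collect shape, emitting each document's tokens as a StringTuple: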
  @Override
  public void map(Text key, Text value,
                  OutputCollector<Text,StringTuple> output, Reporter reporter) throws IOException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
      if (termAtt.termLength() > 0) {
        document.add(new String(termAtt.termBuffer(), 0, termAtt.termLength()));
      }
    }
    output.collect(key, document); // emit all of this document's tokens under its key
  }

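From Apache Jackrabbit's Lucene integration (FieldNames.FULLTEXT is its full-text field name): term and offset attributes are registered together before the stream is consumed: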
        List<String> tokens = new ArrayList<String>();
        TokenStream stream = null;
        try {
            stream = analyzer.tokenStream(FieldNames.FULLTEXT,
                    new StringReader(text));
            CharTermAttribute termAtt = stream
                    .addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = stream
                    .addAttribute(OffsetAttribute.class);
            // TypeAttribute type = stream.addAttribute(TypeAttribute.class);
            stream.reset();
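The excerpt stops after reset(). A minimal way to finish it, assuming the goal is simply to collect term texts into the tokens list (a sketch of one possibility, not necessarily the project's actual loop, which may also consume offsetAtt):

            while (stream.incrementToken()) {
                tokens.add(termAtt.toString());
            }
        } catch (IOException e) {
            // error handling elided in this sketch
        } finally {
            if (stream != null) {
                try { stream.close(); } catch (IOException ignored) { }
            }
        }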

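The final example begins mid-signature; it registers a PositionIncrementAttribute alongside the term attribute so that absolute token positions can be tracked while iterating: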
            throws IOException {

        TokenStream stream = analyzer.tokenStream("contents",
                new StringReader(text));

        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncr = stream
                .addAttribute(PositionIncrementAttribute.class);

        int position = 0;
        while (stream.incrementToken()) {
            // a plausible completion: advance by the position increment and print the term
            position += posIncr.getPositionIncrement();
            System.out.println(position + ": [" + term + "]");
        }
        stream.close();

