Examples of SnowballAnalyzer


Examples of org.apache.lucene.analysis.snowball.SnowballAnalyzer

               
                Set<String> returnSet = new HashSet<String>();
               
                try {
                       
                        Analyzer analyzer =  new SnowballAnalyzer(
                                        org.apache.lucene.util.Version.LUCENE_CURRENT,
                                        "Spanish",
                                        SPANISH_STOP_WORDS);

                       
                        TokenStream tokenStream = analyzer.tokenStream(
                                        "content",
                                        new StringReader(indexCleanedOfHTMLTags));
                       
                        Token token = new Token();
View Full Code Here

Examples of org.apache.lucene.analysis.snowball.SnowballAnalyzer

        analyzerProviderFactories.put("whitespace", new PreBuiltAnalyzerProviderFactory("whitespace", AnalyzerScope.INDICES, new WhitespaceAnalyzer(Lucene.ANALYZER_VERSION)));
        analyzerProviderFactories.put("simple", new PreBuiltAnalyzerProviderFactory("simple", AnalyzerScope.INDICES, new SimpleAnalyzer(Lucene.ANALYZER_VERSION)));

        // extended ones
        analyzerProviderFactories.put("pattern", new PreBuiltAnalyzerProviderFactory("pattern", AnalyzerScope.INDICES, new PatternAnalyzer(Lucene.ANALYZER_VERSION, Regex.compile("\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/, null), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET)));
        analyzerProviderFactories.put("snowball", new PreBuiltAnalyzerProviderFactory("snowball", AnalyzerScope.INDICES, new SnowballAnalyzer(Lucene.ANALYZER_VERSION, "English", StopAnalyzer.ENGLISH_STOP_WORDS_SET)));
        analyzerProviderFactories.put("standard_html_strip", new PreBuiltAnalyzerProviderFactory("standard_html_strip", AnalyzerScope.INDICES, new StandardHtmlStripAnalyzer(Lucene.ANALYZER_VERSION)));

        analyzerProviderFactories.put("arabic", new PreBuiltAnalyzerProviderFactory("arabic", AnalyzerScope.INDICES, new ArabicAnalyzer(Lucene.ANALYZER_VERSION)));
        analyzerProviderFactories.put("armenian", new PreBuiltAnalyzerProviderFactory("armenian", AnalyzerScope.INDICES, new ArmenianAnalyzer(Lucene.ANALYZER_VERSION)));
        analyzerProviderFactories.put("basque", new PreBuiltAnalyzerProviderFactory("basque", AnalyzerScope.INDICES, new BasqueAnalyzer(Lucene.ANALYZER_VERSION)));
View Full Code Here

Examples of org.apache.lucene.analysis.snowball.SnowballAnalyzer

        String language = settings.get("language", settings.get("name", "English"));
        Set<?> defaultStopwords = defaultLanguageStopwords.containsKey(language) ? defaultLanguageStopwords.get(language) : ImmutableSet.<Set<?>>of();
        Set<?> stopWords = Analysis.parseStopWords(env, settings, defaultStopwords);

        analyzer = new SnowballAnalyzer(version, language, stopWords);
    }
View Full Code Here

Examples of org.apache.lucene.analysis.snowball.SnowballAnalyzer

    IndexReader ir;
    try
    {
      ir = IndexReader.open(FSDirectory.open(nsdl_index_dir), true);
      //IndexSearcher searcher = new IndexSearcher(ir);
      SnowballAnalyzer analyzer = new SnowballAnalyzer(SRM.VERSION , "Porter" , SRM.stopWords);
      TokenStream ts ;
      TermAttribute termAtt;
     
      // Query data
      doc = (Document)ir.document(testDocId);
      ts = analyzer.tokenStream("title", new StringReader(doc.get("title")));
      termAtt = ts.addAttribute(TermAttribute.class)
      while (ts.incrementToken())
      {
        rTitle.add(termAtt.term());
      }
     
      ts = analyzer.tokenStream("content", new StringReader(doc.get("content")));
      termAtt = ts.addAttribute(TermAttribute.class)
      while (ts.incrementToken())
      {
        rContent.add(termAtt.term());
      }
     
      ts = analyzer.tokenStream("desc", new StringReader(doc.get("desc")));
      termAtt = ts.addAttribute(TermAttribute.class)
      while (ts.incrementToken())
      {
        rDesc.add(termAtt.term());
      }
     
         // Iterate over all training records to find the score of train , test document pair
      Iterator <Integer> trainIterator = trainDocIds.iterator();
      while (trainIterator.hasNext())
      {
        int docId = (Integer)trainIterator.next();
        doc = (Document)ir.document(docId);
               
        //********************** Title Similarity Score ***************
        ts = analyzer.tokenStream("title", new StringReader(doc.get("title")));
        termAtt = ts.addAttribute(TermAttribute.class);   
       
        // Construct a HashMap of Train record title
        titleMap = new HashMap<String , Integer>();
        titleNI = 0;
       
        while(ts.incrementToken())
        {
          tempToken = termAtt.term();
          if (tempToken.length() > 2)
          {
            titleNI++;
            if (titleMap.containsKey(tempToken))
            {
              titleMap.put(tempToken, titleMap.get(tempToken) + 1);
            }
            else
              titleMap.put(tempToken, 1);
          }
        }
        // Iterate over query title set to find similarity score
        iterator = rTitle.iterator();
        titleSimScore = 0.0;
       
        while(iterator.hasNext())
        {
          tempToken = iterator.next();
          if (titleMap.containsKey(tempToken))
          {
              titleSimScore += ((double)titleMap.get(tempToken) + (100 * titleVocabMap.get(tempToken)/titleLength))/(titleNI + 100);
          }
        }
       
        //********************** Description Similarity Score ****************
        ts = analyzer.tokenStream("desc", new StringReader(doc.get("desc")));
        termAtt = ts.addAttribute(TermAttribute.class)
               
        // Construct a HashMap of Train record description
        descMap = new HashMap<String , Integer>();
        descNI = 0;
       
        while(ts.incrementToken())
        {
         
          tempToken = termAtt.term();
          if (tempToken.length() > 2)
          {
            descNI++;
            if (descMap.containsKey(tempToken))
            {
              descMap.put(tempToken, descMap.get(tempToken) + 1);
            }
            else
              descMap.put(tempToken, 1);
          }
        }
               
        /// Iterate over query description set to find similarity score
        iterator = rDesc.iterator();
        descSimScore = 0.0;
       
        while(iterator.hasNext())
        {
          tempToken = iterator.next();
          if (descMap.containsKey(tempToken))
          {
            descSimScore += ((double)descMap.get(tempToken) + (100 * descVocabMap.get(tempToken))/descLength)/(descNI + 100);
          }
        }
       
        //********************** Content Similarity Score ****************
        ts = analyzer.tokenStream("content", new StringReader(doc.get("content")));
        termAtt = ts.addAttribute(TermAttribute.class)
               
        // Construct a HashMap of Train record content
        contentMap = new HashMap<String , Integer>();
        contentNI = 0;
View Full Code Here

Examples of org.apache.lucene.analysis.snowball.SnowballAnalyzer

    System.out.println("Total Number of Documents: " + allDocIds.size());
    System.out.println("Total Number of Training Documents: " + trainDocIds.size());
    System.out.println("Total Number of Testing Documents: " + testDocIds.size());
   
    SnowballAnalyzer analyzer = new SnowballAnalyzer(SRM.VERSION , "Porter" , SRM.stopWords);
    TokenStream ts ;
    TermAttribute termAtt;
    String tempToken;
   
    Iterator trainIterator = trainDocIds.iterator();
    while (trainIterator.hasNext())
    {
      doc = (Document)ir.document((Integer)trainIterator.next());
      ts = analyzer.tokenStream("title", new StringReader(doc.get("title")));
      termAtt = ts.addAttribute(TermAttribute.class);   
     
      while(ts.incrementToken())
      {
        tempToken = termAtt.term();
        if (tempToken.length() > 2)
        {
          titleLength++;
          if (titleVocabMap.containsKey(tempToken))
          {
            titleVocabMap.put(tempToken, titleVocabMap.get(tempToken) + 1);
          }
          else
            titleVocabMap.put(tempToken, 1);
        }
      }
     
      ts = analyzer.tokenStream("content", new StringReader(doc.get("content")));
      termAtt = ts.addAttribute(TermAttribute.class);   
     
      while(ts.incrementToken())
      {
        tempToken = termAtt.term();
        if (tempToken.length() > 2)
        {
          contentLength++;
          tempToken = termAtt.term();
          if (contentVocabMap.containsKey(tempToken))
          {
            contentVocabMap.put(tempToken, contentVocabMap.get(tempToken) + 1);
          }
          else
            contentVocabMap.put(tempToken, 1);
        }
      }
     
      ts = analyzer.tokenStream("desc", new StringReader(doc.get("desc")));
      termAtt = ts.addAttribute(TermAttribute.class);   
     
      while(ts.incrementToken())
      {
        tempToken = termAtt.term();
View Full Code Here

Examples of org.apache.lucene.analysis.snowball.SnowballAnalyzer

    IndexReader ir;
    try
    {
      ir = IndexReader.open(FSDirectory.open(nsdl_index_dir), true);
      //IndexSearcher searcher = new IndexSearcher(ir);
      SnowballAnalyzer analyzer = new SnowballAnalyzer(SRM.VERSION , "Porter" , SRM.stopWords);
      TokenStream ts ;
      TermAttribute termAtt;
     
      // Query data
      doc = (Document)ir.document(testDocId);
      ts = analyzer.tokenStream("audience", new StringReader(doc.get("audience")));
      termAtt = ts.addAttribute(TermAttribute.class)
      while (ts.incrementToken())
      {
        rAudience.add(termAtt.term());
      }
     
      ts = analyzer.tokenStream("subject", new StringReader(doc.get("subject")));
      termAtt = ts.addAttribute(TermAttribute.class)
      while (ts.incrementToken())
      {
        rSubject.add(termAtt.term());
      }
     
      ts = analyzer.tokenStream("educationLevel", new StringReader(doc.get("educationLevel")));
      termAtt = ts.addAttribute(TermAttribute.class)
      while (ts.incrementToken())
      {
        rEduLevel.add(termAtt.term());
      }
           
      for (int i = 0 ; i < K  ; i++)
      {
        int docId = (Integer)kArray[i];
        doc = (Document)ir.document(docId);
       
        ts = analyzer.tokenStream("audience", new StringReader(doc.get("audience")));
        termAtt = ts.addAttribute(TermAttribute.class)
       
       
        while(ts.incrementToken())
        {
          tempToken = termAtt.term();
          if (tempToken.length() > 2 && rAudience.contains(tempToken))
          {
            aPrecision++;
            break;
          }
        }
       
        ts = analyzer.tokenStream("subject", new StringReader(doc.get("subject")));
        termAtt = ts.addAttribute(TermAttribute.class)
       
        while(ts.incrementToken())
        {
          tempToken = termAtt.term();
          if (tempToken.length() > 2 && rSubject.contains(tempToken))
          {
            sPrecision++;
            break;
          }
        }
       
        ts = analyzer.tokenStream("educationLevel", new StringReader(doc.get("educationLevel")));
        termAtt = ts.addAttribute(TermAttribute.class)
        while(ts.incrementToken())
        {
          tempToken = termAtt.term();
          if (tempToken.length() > 2 && rEduLevel.contains(tempToken))
View Full Code Here

Examples of org.apache.lucene.analysis.snowball.SnowballAnalyzer

        else if(language.equals("danish"))
          analyzer= new DanishAnalyzer();
        else if(language.equals("norwegian"))
          analyzer= new NorwegianAnalyzer();
        else if(language.equals("finnish"))
          analyzer= new SnowballAnalyzer( "Finnish" );
        else if(language.equals("swedish"))
          analyzer= new SnowballAnalyzer( "Swedish" );
       
       
        else {
          String clazzName="org.apache.lucene.analysis.el."+StringUtil.ucFirst(language.trim().toLowerCase())+"Analyzer;";
          Object o=ClassUtil.loadInstance(clazzName,(Object)null);
View Full Code Here

Examples of org.apache.lucene.analysis.snowball.SnowballAnalyzer

  /**
   * Creates new instance of SpanishAnalyzer
   */
  public SpanishAnalyzer() {
    analyzer = new SnowballAnalyzer("Spanish", SPANISH_STOP_WORDS);
  }
View Full Code Here

Examples of org.apache.lucene.analysis.snowball.SnowballAnalyzer

  public SpanishAnalyzer() {
    analyzer = new SnowballAnalyzer("Spanish", SPANISH_STOP_WORDS);
  }

  public SpanishAnalyzer(String stopWords[]) {
    analyzer = new SnowballAnalyzer("Spanish", stopWords);
  }
View Full Code Here

Examples of org.apache.lucene.analysis.snowball.SnowballAnalyzer

  /**
   * Creates new instance of SpanishAnalyzer
   */
  public PortugueseAnalyzer() {
    analyzer = new SnowballAnalyzer("Portuguese", PORTUGUESE_STOP_WORDS);
  }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., which is owned by Oracle Inc. Contact coftware#gmail.com.