Examples of org.apache.lucene.util.automaton.RegExp

Package org.apache.lucene.util.automaton

Examples of org.apache.lucene.util.automaton.RegExp

org.apache.lucene.util.automaton.RegExp

Regular Expression extension to Automaton.

Regular expressions are built from the following abstract syntax:

regexp	::=	unionexp
	\|
unionexp	::=	interexp `\|` unionexp	(union)
	\|	interexp
interexp	::=	concatexp `&` interexp	(intersection)	[OPTIONAL]
	\|	concatexp
concatexp	::=	repeatexp concatexp	(concatenation)
	\|	repeatexp
repeatexp	::=	repeatexp `?`	(zero or one occurrence)
	\|	repeatexp `*`	(zero or more occurrences)
	\|	repeatexp `+`	(one or more occurrences)
	\|	repeatexp `{n}`	(`n` occurrences)
	\|	repeatexp `{n,}`	(`n` or more occurrences)
	\|	repeatexp `{n,m}`	(`n` to `m` occurrences, including both)
	\|	complexp
complexp	::=	`~` complexp	(complement)	[OPTIONAL]
	\|	charclassexp
charclassexp	::=	`[` charclasses `]`	(character class)
	\|	`[^` charclasses `]`	(negated character class)
	\|	simpleexp
charclasses	::=	charclass charclasses
	\|	charclass
charclass	::=	charexp `-` charexp	(character range, including end-points)
	\|	charexp
simpleexp	::=	charexp
	\|	`.`	(any single character)
	\|	`#`	(the empty language)	[OPTIONAL]
	\|	`@`	(any string)	[OPTIONAL]
	\|	`"` <Unicode string without double-quotes> `"`	(a string)
	\|	`(` `)`	(the empty string)
	\|	`(` unionexp `)`	(precedence override)
	\|	`<` <identifier> `>`	(named automaton)	[OPTIONAL]
	\|	`<n-m>`	(numerical interval)	[OPTIONAL]
charexp	::=	<Unicode character>	(a single non-reserved character)
	\|	`\` <Unicode character>	(a single character)

The productions marked [OPTIONAL] are only allowed if specified by the syntax flags passed to the RegExp constructor. The reserved characters used in the (enabled) syntax must be escaped with backslash (\) or double-quotes ("..."). (In contrast to other regexp syntaxes, this is required also in character classes.) Be aware that dash (-) has a special meaning in charclass expressions. An identifier is a string not containing right angle bracket (>) or dash (-). Numerical intervals are specified by non-negative decimal integers and include both end points, and if n and m have the same number of digits, then the conforming strings must have that length (i.e. prefixed by 0's). @lucene.experimental


    final Map<String, Analyzer> fieldAnalyzers = new TreeMap<String, Analyzer>();
    fieldAnalyzers.put( "field", new MockAnalyzer( random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET ) );
    fieldAnalyzers.put( "field_exact", new MockAnalyzer( random() ) );
    fieldAnalyzers.put( "field_super_exact", new MockAnalyzer( random(), MockTokenizer.WHITESPACE, false ) );
    fieldAnalyzers.put( "field_characters", new MockAnalyzer( random(), new CharacterRunAutomaton( new RegExp(".").toAutomaton() ), true ) );
    fieldAnalyzers.put( "field_tripples", new MockAnalyzer( random(), new CharacterRunAutomaton( new RegExp("...").toAutomaton() ), true ) );
    fieldAnalyzers.put( "field_sliced", fieldAnalyzers.get( "field" ) );
    fieldAnalyzers.put( "field_der_red", fieldAnalyzers.get( "field" ) );  // This is required even though we provide a token stream
    Analyzer analyzer = new AnalyzerWrapper() {
      public Analyzer getWrappedAnalyzer(String fieldName) {
        return fieldAnalyzers.get( fieldName );

View Full Code Here

  
  public void testMaxSizeEndHighlight() throws Exception {
    TestHighlightRunner helper = new TestHighlightRunner() {
      @Override
      public void run() throws Exception {
        CharacterRunAutomaton stopWords = new CharacterRunAutomaton(new RegExp("i[nt]").toAutomaton());
        TermQuery query = new TermQuery(new Term("text", "searchterm"));


        String text = "this is a text with searchterm in it";
        SimpleHTMLFormatter fm = new SimpleHTMLFormatter();
        Highlighter hg = getHighlighter(query, "text", fm);

View Full Code Here

    assertEquals(two, qp.parse("/foo/ /bar/", df));
  }


  public void testStopwords() throws Exception {
    StandardQueryParser qp = new StandardQueryParser();
    CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
    qp.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet));


    Query result = qp.parse("a:the OR a:foo", "a");
    assertNotNull("result is null and it shouldn't be", result);
    assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery);

View Full Code Here

    assertEquals(two, getQuery("field:/foo/ field:/bar/",qp));
    assertEquals(two, getQuery("/foo/ /bar/",qp));
  }
  
  public void testStopwords() throws Exception {
    CharacterRunAutomaton stopSet = new CharacterRunAutomaton(new RegExp("the|foo").toAutomaton());
    CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet));
    Query result = getQuery("field:the OR field:foo",qp);
    assertNotNull("result is null and it shouldn't be", result);
    assertTrue("result is not a BooleanQuery", result instanceof BooleanQuery);
    assertTrue(((BooleanQuery) result).clauses().size() + " does not equal: " + 0, ((BooleanQuery) result).clauses().size() == 0);

View Full Code Here

    setDefaultField(oldDefaultField);
  }


  public void testPhraseQueryPositionIncrements() throws Exception {
    CharacterRunAutomaton stopStopList =
    new CharacterRunAutomaton(new RegExp("[sS][tT][oO][pP]").toAutomaton());


    CommonQueryParserConfiguration qp = getParserConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList));


    qp = getParserConfig(
                         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false, stopStopList));

View Full Code Here

    w.forceMerge(1);
    DirectoryReader r = w.getReader();
    w.close();
    AtomicReader sub = getOnlySegmentReader(r);
    Terms terms = sub.fields().terms("field");
    Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();    
    CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);    
    TermsEnum te = terms.intersect(ca, null);
    assertEquals("aaa", te.next().utf8ToString());
    assertEquals(0, te.docs(null, null, DocsEnum.FLAG_NONE).nextDoc());
    assertEquals("bbb", te.next().utf8ToString());

View Full Code Here

    w.forceMerge(1);
    DirectoryReader r = w.getReader();
    w.close();
    AtomicReader sub = getOnlySegmentReader(r);
    Terms terms = sub.fields().terms("field");
    Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();    
    CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);    
    TermsEnum te = terms.intersect(ca, null);
    assertEquals("aaa", te.next().utf8ToString());
    assertEquals(0, te.docs(null, null, DocsEnum.FLAG_NONE).nextDoc());
    assertEquals("bbb", te.next().utf8ToString());

View Full Code Here

    DirectoryReader r = w.getReader();
    w.close();
    AtomicReader sub = getOnlySegmentReader(r);
    Terms terms = sub.fields().terms("field");


    Automaton automaton = new RegExp(".*d", RegExp.NONE).toAutomaton();
    CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);    
    TermsEnum te;
    
    // should seek to startTerm
    te = terms.intersect(ca, new BytesRef("aad"));

View Full Code Here

    DirectoryReader r = w.getReader();
    w.close();
    AtomicReader sub = getOnlySegmentReader(r);
    Terms terms = sub.fields().terms("field");


    Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();  // accept ALL
    CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);    


    TermsEnum te = terms.intersect(ca, null);
    DocsEnum de;

View Full Code Here


    final Map<String, Analyzer> fieldAnalyzers = new TreeMap<String, Analyzer>();
    fieldAnalyzers.put( "field", new MockAnalyzer( random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET ) );
    fieldAnalyzers.put( "field_exact", new MockAnalyzer( random() ) );
    fieldAnalyzers.put( "field_super_exact", new MockAnalyzer( random(), MockTokenizer.WHITESPACE, false ) );
    fieldAnalyzers.put( "field_characters", new MockAnalyzer( random(), new CharacterRunAutomaton( new RegExp(".").toAutomaton() ), true ) );
    fieldAnalyzers.put( "field_tripples", new MockAnalyzer( random(), new CharacterRunAutomaton( new RegExp("...").toAutomaton() ), true ) );
    fieldAnalyzers.put( "field_sliced", fieldAnalyzers.get( "field" ) );
    fieldAnalyzers.put( "field_der_red", fieldAnalyzers.get( "field" ) );  // This is required even though we provide a token stream
    Analyzer analyzer = new AnalyzerWrapper() {
      public Analyzer getWrappedAnalyzer(String fieldName) {
        return fieldAnalyzers.get( fieldName );

View Full Code Here

0 1 2 3 4 5 6 7 8

TOP

Related Classes of org.apache.lucene.util.automaton.RegExp

io.crate.operation.operator.RegexpMatchOperator

org.apache.lucene.analysis.hunspell.Dictionary

org.apache.lucene.analysis.TestMockAnalyzer

org.apache.lucene.codecs.lucene41.TestBlockPostingsFormat3

org.apache.lucene.index.TestDuelingCodecs

org.apache.lucene.index.TestTermsEnum

org.apache.lucene.queryparser.flexible.standard.TestQPHelper

org.apache.lucene.queryparser.util.QueryParserTestBase

org.apache.lucene.search.highlight.HighlighterTest

org.apache.lucene.search.RegexpQuery

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.