Package edu.washington.cs.knowitall.extractor

Source Code of edu.washington.cs.knowitall.extractor.RegexExtractor

package edu.washington.cs.knowitall.extractor;

import java.util.ArrayList;
import java.util.Collection;

import edu.washington.cs.knowitall.commonlib.Range;
import edu.washington.cs.knowitall.nlp.ChunkedSentence;
import edu.washington.cs.knowitall.nlp.extraction.ChunkedExtraction;
import edu.washington.cs.knowitall.sequence.LayeredTokenMatcher;
import edu.washington.cs.knowitall.sequence.LayeredTokenPattern;
import edu.washington.cs.knowitall.sequence.SequenceException;

/**
* An extractor that uses a regular expression pattern relations from NP-chunked
* sentences. This class uses a {@link LayeredTokenPattern} object to represent
* the regular expression pattern.
*
* @author afader
*
*/
public class RegexExtractor extends
        Extractor<ChunkedSentence, ChunkedExtraction> {

    private String patternString;
    private LayeredTokenPattern pattern;

    /**
     * Constructs a new instance using the given pattern.
     *
     * @param pattern
     *            the relation pattern
     * @throws SequenceException
     *             if unable to compile pattern
     */
    public RegexExtractor(String patternString) throws SequenceException {
        this.patternString = patternString;
        this.pattern = new LayeredTokenPattern(patternString);
    }

    /**
     * @return the String relation pattern.
     */
    public String getPatternString() {
        return patternString;
    }

    /**
     * @return the <code>LayeredTokenPattern</code> used to extract relations.
     */
    public LayeredTokenPattern getPattern() {
        return pattern;
    }

    @Override
    /**
     * Extracts relations matching the regular expression.
     */
    public Collection<ChunkedExtraction> extractCandidates(
            ChunkedSentence sentence) throws ExtractorException {

        try {

            LayeredTokenMatcher m = pattern.matcher(sentence);
            Collection<ChunkedExtraction> results = new ArrayList<ChunkedExtraction>();

            while (m.find()) {
                int start = m.start();
                int length = m.end() - start;
                Range r = new Range(start, length);
                ChunkedExtraction extr = new ChunkedExtraction(sentence, r);
                results.add(extr);
            }
            return results;

        } catch (SequenceException e) {
            String msg = String.format("Couldn't extract from sentence '%s'",
                    sentence);
            throw new ExtractorException(msg, e);
        }
    }

}
TOP

Related Classes of edu.washington.cs.knowitall.extractor.RegexExtractor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.