Package edu.washington.cs.knowitall.argumentidentifier

Source Code of edu.washington.cs.knowitall.argumentidentifier.ArgLearnerFeatureSet

package edu.washington.cs.knowitall.argumentidentifier;

import java.util.ArrayList;
import java.util.List;

import edu.washington.cs.knowitall.commonlib.Range;
import edu.washington.cs.knowitall.extractor.conf.featureset.FeatureSet;
import edu.washington.cs.knowitall.nlp.ChunkedSentence;
import edu.washington.cs.knowitall.nlp.extraction.ChunkedArgumentExtraction;
import edu.washington.cs.knowitall.nlp.extraction.ChunkedBinaryExtraction;
import edu.washington.cs.knowitall.nlp.extraction.ChunkedExtraction;

public class ArgLearnerFeatureSet extends FeatureSet<ChunkedBinaryExtraction> {
    private final PatternExtractor patternExtractor;

    public ArgLearnerFeatureSet() {
        super(featureNames());
        this.patternExtractor = new PatternExtractor();
    }

    public static List<String> featureNames() {
        List<String> names = new ArrayList<String>(34);
        names.add("correct_end");
        names.add("pred_starts_w_np");
        names.add("to_before_pred");

        names.add("conj_before_rel");
        names.add("which_before_rel");
        names.add("rel_one_verb");
        names.add("rel_to");
        names.add("rel_for");
        names.add("rel_in");
        names.add("rel_of");
        names.add("rel_on");

        names.add("pp_before_arg1");
        names.add("words_till_start");
        names.add("arg1_conf");
        names.add("arg1_proper");
        names.add("np_before_arg1");
        names.add("arg1_length");

        names.add("adj");
        names.add("comp");
        names.add("nest1");
        names.add("nest2");
        names.add("rel");
        names.add("npinf");
        names.add("doublenp");

        names.add("arg2_proper");
        names.add("verb_after_arg2");
        names.add("np_after_arg2");
        names.add("pp_after_arg2");
        names.add("words_till_end");
        names.add("arg2_conf");

        names.add("sent_less_than_10");
        names.add("sent_less_than_20");
        names.add("sent_more_than_20");
        names.add("extr_covers_phrase");

        return names;
    }

    private double toDouble(boolean bool) {
        // of course, weka featurizes booleans inversely
        if (bool) {
            return 0.0;
        }
        else {
            return 1.0;
        }
    }

    private double toDouble(int num) {
        return 0.0 + num;
    }

    private double toDouble(double num) {
        return num;
    }

    @Override
    public double featurize(String featureName, ChunkedBinaryExtraction extr) {
        // relation metrics
        if (featureName.equals("pred_starts_w_np")) {
            return toDouble(predStartsWithNP(extr));
        } else if (featureName.equals("to_before_pred")) {
            return toDouble(toBeforePred(extr));
        } else if (featureName.equals("conj_before_rel")) {
            return toDouble(conjBeforeRel(extr));
        } else if (featureName.equals("which_before_rel")) {
            return toDouble(whichBeforeRel(extr));
        } else if (featureName.equals("rel_one_verb")) {
            return toDouble(relOneVerb(extr));
        } else if (featureName.equals("rel_to")) {
            return toDouble(relEndsWithToken(extr, "to"));
        } else if (featureName.equals("rel_for")) {
            return toDouble(relEndsWithToken(extr, "for"));
        } else if (featureName.equals("rel_in")) {
            return toDouble(relEndsWithToken(extr, "in"));
        } else if (featureName.equals("rel_of")) {
            return toDouble(relEndsWithToken(extr, "of"));
        } else if (featureName.equals("rel_on")) {
            return toDouble(relEndsWithToken(extr, "on"));
        }

        // arg1 metrics
        else if (featureName.equals("correct_end")) {
            return toDouble(correctArg1End(extr));
        } else if (featureName.equals("pp_before_arg1")) {
            return toDouble(ppBeforeArg1(extr));
        } else if (featureName.equals("words_till_start")) {
            return toDouble(wordsTillStart(extr));
        } else if (featureName.equals("arg1_conf")) {
            return toDouble(extr.getArgument1().getConfidence());
        } else if (featureName.equals("arg1_proper")) {
            return toDouble(arg1IsProper(extr));
        } else if (featureName.equals("np_before_arg1")) {
            return toDouble(npBeforeArg1(extr));
        } else if (featureName.equals("arg1_length")) {
            return toDouble(arg1Length(extr));
        }

        // arg2 metrics
        else if (featureName.equals("adj")) {
            return toDouble(patternExtractor.adjRelation(extr.getRelation()));
        } else if (featureName.equals("comp")) {
            return toDouble(patternExtractor.complementClause(extr
                    .getRelation()));
        } else if (featureName.equals("nest1")) {
            return toDouble(patternExtractor
                    .nestedRelation1(extr.getRelation()));
        } else if (featureName.equals("nest2")) {
            return toDouble(patternExtractor
                    .nestedRelation2(extr.getRelation()));
        } else if (featureName.equals("rel")) {
            return toDouble(patternExtractor.npRelativeClause(extr
                    .getRelation()));
        } else if (featureName.equals("npinf")) {
            return toDouble(patternExtractor.npInfinitiveClause(extr
                    .getRelation()));
        } else if (featureName.equals("doublenp")) {
            return toDouble(patternExtractor.doubleNP(extr.getRelation()));
        } else if (featureName.equals("arg2_proper")) {
            return toDouble(arg2IsProper(extr));
        } else if (featureName.equals("verb_after_arg2")) {
            return toDouble(verbAfterArg2(extr));
        } else if (featureName.equals("np_after_arg2")) {
            return toDouble(npAfterArg2(extr));
        } else if (featureName.equals("pp_after_arg2")) {
            return toDouble(ppAfterArg2(extr));
        } else if (featureName.equals("words_till_end")) {
            return toDouble(wordsTillStart(extr));
        } else if (featureName.equals("arg2_conf")) {
            return toDouble(extr.getArgument2().getConfidence());
        }

        // sentence metric
        else if (featureName.equals("sent_less_than_10")) {
            return toDouble(sentLength(extr, 0, 11));
        } else if (featureName.equals("sent_less_than_20")) {
            return toDouble(sentLength(extr, 11, 21));
        } else if (featureName.equals("sent_more_than_20")) {
            return toDouble(sentLength(extr, 21, Integer.MAX_VALUE));
        } else if (featureName.equals("extr_covers_phrase")) {
            return toDouble(extrCoversPhrase(extr));
        }
        else {
            throw new IllegalArgumentException();
        }
    }

    public int getIntValue(boolean bool, boolean dir) {
        if (bool == dir) {
            return 1;
        } else {
            return 0;
        }
    }

    private boolean extrCoversPhrase(ChunkedBinaryExtraction e) {

        ChunkedSentence sent = e.getSentence();
        List<String> tokens = sent.getTokens();

        Range x = e.getArgument1().getRange();
        Range y = e.getArgument2().getRange();
        Range r = e.getRelation().getRange();
        boolean adj = x.isAdjacentTo(r) && r.isAdjacentTo(y);

        int xs = x.getStart();
        boolean leftOk = xs == 0 || tokens.get(xs - 1).equals(",")
                || tokens.get(xs - 1).equals(".");

        int l = sent.getLength() - 1;
        int yr = y.getLastIndex();
        boolean rightOk = yr == l || tokens.get(yr + 1).equals(",")
                || tokens.get(yr + 1).equals(".");

        return adj && leftOk && rightOk;
    }

    private boolean sentLength(ChunkedBinaryExtraction e, int lower, int upper) {
        final int a = lower;
        final int b = upper;
        ChunkedSentence sent = e.getSentence();
        int len = sent.getLength();
        return a <= len && len < b;
    }

    private boolean npAfterArg2(ChunkedBinaryExtraction e) {
        ChunkedArgumentExtraction arg2 = e.getArgument2();
        int lastArg2 = arg2.getRange().getLastIndex();
        ChunkedSentence sent = arg2.getSentence();
        return lastArg2 + 1 < sent.getLength()
                && sent.getChunkTags().get(lastArg2 + 1).equals("B-NP");
    }

    private boolean verbAfterArg2(ChunkedBinaryExtraction e) {
        ChunkedArgumentExtraction arg2 = e.getArgument2();
        int pastArg2 = arg2.getStart() + arg2.getLength();
        if (pastArg2 < e.getSentence().getLength()) {
            String pastPosTag = e.getSentence().getPosTags().get(pastArg2);
            if (pastPosTag.equals("MD") || pastPosTag.startsWith("V")) {
                return true;
            }
        }
        return false;
    }

    private boolean npBeforeArg1(ChunkedBinaryExtraction e) {
        ChunkedExtraction arg1 = e.getArgument1();
        int start = arg1.getRange().getStart();
        if (start == 0) {
            return false;
        } else {
            ChunkedSentence sent = arg1.getSentence();
            return sent.getChunkTags().get(start - 1).endsWith("-NP");
        }

    }

    private boolean arg1IsProper(ChunkedBinaryExtraction e) {
        return isProperNp(e.getArgument2());
    }

    private boolean arg2IsProper(ChunkedBinaryExtraction e) {
        return isProperNp(e.getArgument2());
    }

    private boolean isProperNp(ChunkedExtraction e) {
        for (String tag : e.getPosTags()) {
            if (!tag.startsWith("NNP") && !tag.equals("DT")
                    && !tag.equals("IN")) {
                return false;
            }
        }
        return true;
    }

    private boolean relEndsWithToken(ChunkedBinaryExtraction e, String t) {
        final String token = t;
        List<String> tokens = e.getRelation().getTokens();
        return tokens.get(tokens.size() - 1).equals(token);
    }

    private boolean relOneVerb(ChunkedBinaryExtraction e) {
        ChunkedExtraction rel = e.getRelation();
        List<String> posTags = rel.getPosTags();
        return posTags.size() == 1 && posTags.get(0).startsWith("V");
    }

    private boolean whichBeforeRel(ChunkedBinaryExtraction e) {
        ChunkedExtraction pred = e.getRelation();
        int predStart = pred.getStart();
        if (predStart > 0) {
            String precPosTag = e.getSentence().getPosTags().get(predStart - 1);
            String precPosToken = e.getSentence().getToken(predStart - 1);
            if (precPosTag.equals("WP") || precPosTag.equals("WDT")
                    || precPosToken.equals("that")) {
                return true;
            }
        }
        return false;
    }

    private boolean conjBeforeRel(ChunkedBinaryExtraction e) {
        ChunkedExtraction pred = e.getRelation();
        int predStart = pred.getStart();
        if (predStart > 0) {
            String precPosTag = e.getSentence().getPosTags().get(predStart - 1);
            if (precPosTag.equals("CC")) {
                return true;
            }
        }
        return false;
    }

    public int wordsTillStart(ChunkedBinaryExtraction extr) {
        if (extr.getArgument1() == null || extr.getArgument1().getLength() < 1) {
            return -1;
        }
        int words_till_start = extr.getArgument1().getStart();
        return words_till_start;
    }

    public int arg1Length(ChunkedBinaryExtraction extr) {
        if (extr.getArgument1() == null || extr.getArgument1().getLength() < 1) {
            return -1;
        }
        return extr.getArgument1().getLength();
    }

    public int wordsTillEnd(ChunkedBinaryExtraction extr) {
        if (extr.getArgument2() == null || extr.getArgument2().getLength() < 1) {
            return -1;
        }
        int words_till_end = extr.getSentence().getLength()
                - (extr.getArgument2().getStart() + extr.getArgument2()
                        .getLength());
        return words_till_end;
    }

    public boolean ppAfterArg2(ChunkedBinaryExtraction extr) {
        if (wordsTillEnd(extr) > 0) {
            int end = extr.getArgument2().getStart()
                    + extr.getArgument2().getLength();
            if (extr.getSentence().getChunkTag(end).equals("B-PP")) {
                return true;
            }
        }
        return false;
    }

    public boolean correctArg1End(ChunkedBinaryExtraction extr) {
        int i = extr.getArgument1().getStart();
        int start = i;
        while (i < extr.getArgument1().getStart()
                + extr.getArgument1().getLength()) {
            if (extr.getSentence().getChunkTag(i).equals("B-NP")) {
                start = i;
            }
            i++;
        }
        return patternExtractor.findSubj(extr.getRelation(), start);
    }

    public boolean ppBeforeArg1(ChunkedBinaryExtraction extr) {
        if (wordsTillStart(extr) > 0) {
            int start = extr.getArgument1().getStart();
            if (extr.getSentence().getChunkTag(start - 1).equals("B-PP")) {
                return true;
            }
        }
        return false;
    }

    public boolean npBeforeArg2(ChunkedBinaryExtraction extr, boolean train) {
        for (int i = extr.getRelation().getStart()
                + extr.getRelation().getLength(); i < extr.getArgument2()
                .getStart(); i++) {
            if (extr.getSentence().getChunkTag(i).contains("NP")) {
                return true;
            }
        }
        return false;
    }

    public boolean predStartsWithNP(ChunkedBinaryExtraction extr) {
        // check that the relation is in a vp
        if (extr.getSentence().getPosTag(extr.getRelation().getStart())
                .contains("N")) {
            return true;
        }
        return false;
    }

    public boolean toInPred(ChunkedBinaryExtraction extr, boolean train) {
        // check for to in current pred
        if (train) {
            for (int i = extr.getRelation().getStart(); i < extr.getArgument2()
                    .getStart(); i++) {
                if (extr.getSentence().getChunkTag(i).equals("B-VP")
                        && extr.getSentence().getPosTag(i).equals("TO")) {
                    return true;
                }
            }
        }
        return false;
    }

    public boolean toBeforePred(ChunkedBinaryExtraction extr) {
        // find appropriate vp;
        boolean in_conj = false;
        int i = extr.getRelation().getStart() - 1;
        while (i > -1) {
            if (extr.getSentence().getPosTag(i).equals("CC")
                    || extr.getSentence().getPosTag(i).equals(",")) {
                in_conj = true;
                break;
            }
            if (extr.getSentence().getChunkTag(i).contains("NP")) {
                break;
            }
            i--;
        }
        int last_vp = extr.getRelation().getStart();
        if (in_conj) {
            boolean seen_vp = false;
            while (i > -1) {
                if (extr.getSentence().getPosTag(i).equals("CC")
                        || extr.getSentence().getPosTag(i).equals(",")) {
                    seen_vp = false;
                } else if (extr.getSentence().getChunkTag(i).equals("B-VP")) {
                    seen_vp = true;
                    last_vp = i;
                } else if ((extr.getSentence().getChunkTag(i).equals("B-NP") || extr
                        .getSentence().getChunkTag(i).equals("I-NP"))
                        && seen_vp) {
                    break;
                }
                if (extr.getSentence().getChunkTag(i).equals("B-VP")
                        && extr.getSentence().getPosTag(i).equals("TO")) {
                    return true;
                }
                i--;
            }
        }
        // check for a to
        i = last_vp;
        boolean foundnp = false;
        while (i > -1) {
            if (extr.getSentence().getToken(i).equals("to")) {
                return true;
            }
            if (extr.getSentence().getChunkTag(i).equals("B-NP")
                    || extr.getSentence().getChunkTag(i).equals("I-NP")) {
                foundnp = true;
                break;
            }
            i--;
        }

        if (foundnp) {
            return false;
        }

        return false;
    }
}
TOP

Related Classes of edu.washington.cs.knowitall.argumentidentifier.ArgLearnerFeatureSet

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle Inc. Contact coftware#gmail.com.