Package uk.ac.cam.ch.wwmm.oscar3.recogniser.document

Examples of uk.ac.cam.ch.wwmm.oscar3.recogniser.document.Token


      }
      prevCell = cell;
    }
    //System.out.println("**************************");
    // Parenthetical phrases
    Token beforeBracket = null;
    Token prevToken = null;
    for(Token token : tokSeq.getTokens()) {
      if(prevToken == null) {
        prevToken = token;
        continue;
      }
      if(beforeBracket == null) {
        if(token.getValue().equals("(")) beforeBracket = prevToken;
      } else if(token.getValue().equals(")")) {
        LatticeCell beforeBracketCell = tokensToTokenCells.get(beforeBracket);
        LatticeCell bracketCell = tokensToTokenCells.get(token);
        if(beforeBracketCell != null || bracketCell != null) {
          beforeBracketCell.addInheritance(bracketCell);
          //System.out.println("Parenthetical: " + tokSeq.getSubstring(beforeBracket.getId(), token.getId()));
        }
        beforeBracket = null;
      }
      prevToken = token;
    }
   
    // Build in named entities
    for(Token token : tokSeq.getTokens()) {
      if(startsToNEs.containsKey(token.getStart())) {
        for(Element neElem : startsToNEs.get(token.getStart())) {
          Token endToken = procDoc.getTokenByEnd(neElem.getAttributeValue("to"));
          if(endToken == null) continue;
          prevCell = tokenToPrev.get(token);
          LatticeCell endTokenCell = tokensToTokenCells.get(endToken);
          if(endTokenCell == null) continue;
          LatticeCell neCell = new LatticeCell(neElem, endTokenCell);
View Full Code Here


 
  public static List<String> featuresForToken(Token token) {
    List<String> features = new ArrayList<String>();
    for(int i=1;i<=10;i++) {
      int inum = i;
      Token after = token.getNAfter(i);
      if(after != null) features.add("afters" + inum + "=" + st.getStem(after.getValue()));
      if(after != null) features.add("afters" + (inum+1) + "=" + st.getStem(after.getValue()));
      //if(after != null) features.add("after" + inum + "=" + after.getValue());
      //if(after != null) features.add("after" + (inum+1) + "=" + after.getValue());
      if(after != null) features.add("ui=" + after.getValue());
      Token before = token.getNAfter(-i);
      if(before != null) features.add("befores" + inum + "=" + st.getStem(before.getValue()));
      if(before != null) features.add("befores" + (inum+1) + "=" + st.getStem(before.getValue()));
      //if(before != null) features.add("before" + inum + "=" + before.getValue());
      //if(before != null) features.add("before" + (inum+1) + "=" + before.getValue());
      if(before != null) features.add("ui=" + before.getValue());
    }
   
    return features;
  }
View Full Code Here

      //NameRecogniser nr = new NameRecogniser();
      //nr.halfProcess(sourceDoc);
      //nr.buildTokenTables(safDoc.getRootElement(), false, false);

      for(Entry e : entriesByFileName.get(fn)) {
        Token t = procDoc.getTokenByStart(e.start);
        if(t != null) {
          List<String> features = featuresForToken(t);
          Event event = new Event(e.type, features.toArray(new String[0]));
          events.add(event);
        }
View Full Code Here

    for(NamedEntity ne : neList) {
      if(ne.getType().equals(NETypes.POTENTIALACRONYM)) {
        int start = ne.getStart();
        //int end = ne.getEnd();

        Token t = tokensByStart.get(start);
        //if(t != null) System.out.println("AHA: " + t.getValue());
        if(t != null && t.getNAfter(-2) != null && t.getNAfter(1) != null) {
          Token prev = t.getNAfter(-1);
          Token next = t.getNAfter(1);
          Token prevPrev = t.getNAfter(-2);
          if(prev.getValue().equals("(") && next.getValue().endsWith(")")) {
            //boolean matched = false;
            if(endToNe.containsKey(prevPrev.getEnd())) {
              NamedEntity acronymOf = endToNe.get(prevPrev.getEnd());
              if(StringTools.testForAcronym(ne.getSurface(), acronymOf.getSurface())) {
                //System.out.println(ne.getSurface() + " is " + acronymOf.getSurface());
                if(acronymOf.getType().equals(NETypes.ASE) || acronymOf.getType().equals(NETypes.ASES)) {
                  //System.out.println("Skip ASE acronym");
                } else {
View Full Code Here

    boolean doSort = true;
    for(String s : ss) {
      s = s.replaceAll("\\s+", " ");
      TokenSequence t = Tokeniser.getInstance().tokenise(s);
      for(int i=0;i<t.size();i++) {
        Token token = t.getToken(i);
        if(token.getValue().equalsIgnoreCase(word1)) {
          for(int j=i+1;j<t.size() && j<i+10;j++) {
            Token token2 = t.getToken(j);
            if(token2.getValue().equalsIgnoreCase(word2)) {
              int wstart1 = token.getStart();
              int wend1 = token.getEnd();
             
              int wstart2 = token2.getStart();
              int wend2 = token2.getEnd();
             
              String before = null;
              if(wstart1 < beforeWidth) {
                before = StringTools.multiplyString(" ", beforeWidth - wstart1) + s.substring(0, wstart1);
              } else {
                before = s.substring(wstart1 - beforeWidth, wstart1);
              }
             
              String after = null;
              if(s.length() - wend2 > afterWidth) {
                after = s.substring(wend2, wend2 + afterWidth);
              } else {
                after = s.substring(wend2);
              }
             
              String middle = s.substring(wend1, wstart2);
              String middleDisplay = middle;
              if(middle.length() > middleWidth) continue;
              if(middle.length() < middleWidth) {
                if(mode.startsWith("left")) {
                  middleDisplay += StringTools.multiplyString(" ", middleWidth - middle.length());                                   
                } else if(mode.startsWith("right")) {
                  middleDisplay = StringTools.multiplyString(" ", middleWidth - middle.length()) + middleDisplay;                 
                } else {
                  int toPad = middleWidth - middle.length();
                  int left = toPad / 2;
                  int right = toPad - left;
                  middleDisplay = StringTools.multiplyString(" ", left) +
                    middleDisplay + StringTools.multiplyString(" ", right);
                }
              }
             
              String display = before + "  " + token.getValue() + "  " + middleDisplay + " " + token2.getValue() + " " + after;
              String sort = "";
              if("interleave".equals(mode)) {
                sort = middleDisplay.toLowerCase() + interleave(before, after, true);           
              } else if("left".equals(mode)) {
                StringBuffer sb = new StringBuffer(before.toLowerCase());
View Full Code Here

   *
   * @param word The string to test.
   */
  public static void destroyInstanceIfWordTokenises(String word) {
    if(myInstance == null) return;
    TokenSequence ts = Tokeniser.getInstance().tokenise(word);
    if(ts.getTokens().size() > 1) myInstance = null;
  }
View Full Code Here

   *
   * @param word The string to test.
   */
  public static void destroyInstanceIfWordTokenises(String word) {
    if(myInstance == null) return;
    TokenSequence ts = Tokeniser.getInstance().tokenise(word);
    if(ts.getTokens().size() > 1) myInstance = null;
  }
View Full Code Here

        ontDFANumber++;
      }
      type = type + "_" + ontDFANumber;*/
    }

    TokenSequence ts = Tokeniser.getInstance().tokenise(ne);
    List<String> tokens = ts.getTokenStringList();

    if(!alwaysAdd && tokens.size() == 1 && !ne.contains("$")) return;
    StringBuffer sb = new StringBuffer();
    for(String token : tokens) {
      sb.append(getRepForToken(token));
View Full Code Here

                featuresForAbbrev.add(abbrMode + "abbr<" + thresh);
              }             
            }
          } else {
            int tokID = ne.getFirstToken().getId();
            TokenSequence tokSeq = ne.getFirstToken().getTokenSequence();
            int length = surf.length();
            boolean isAcro = false;
            if(allCaps.matcher(surf).matches()) {
              if(length <= (tokID - 1)) {
                isAcro = true;
                for(int i=0;i<length;i++) {
                  if(!tokSeq.getToken(tokID - length - 1 + i).getValue().toUpperCase().startsWith(surf.substring(i,i+1))) isAcro = false;
                }
                if(isAcro) {
                  featuresForAbbrev.add("allUpperAbbrev");
                }
              }
View Full Code Here

    double conf = ne.getConfidence();
    double confLog = Math.log(conf) - Math.log(1 - conf);

    List<String> features = new ArrayList<String>();
   
    TokenSequence t = ne.getTokens().get(0).getTokenSequence();
    int entityLength = ne.getTokens().size();
    int startID = ne.getTokens().get(0).getId();
    int endID = startID + entityLength - 1;
    String surf = ne.getSurface();
   
View Full Code Here

TOP

Related Classes of uk.ac.cam.ch.wwmm.oscar3.recogniser.document.Token

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.