Package uk.ac.cam.ch.wwmm.oscar3.preprocessor

Examples of uk.ac.cam.ch.wwmm.oscar3.preprocessor.HTMLPreprocessor


    Map<TokenSequence,Boolean> tokSeqs = new HashMap<TokenSequence,Boolean>();
    Map<TokenSequence,Bag<String>> tokSeqPRWs = new HashMap<TokenSequence,Bag<String>>();
   
    for(int i=0;i<n.size();i++) {
      Element e = (Element)n.get(i);
      Token token = procDoc.getTokenByStart(e.getAttributeValue("from"));
      if(token == null) token = procDoc.getTokenByEnd(e.getAttributeValue("to"));
      if(token == null) {
        System.out.println("Eeep!");
      } else {
        TokenSequence tokSeq = token.getTokenSequence();
        boolean isReact = "REACT".equals(SafTools.getSlotValue(e, "subtype"));
        if(tokSeqs.containsKey(tokSeq)) {
          if(isReact) tokSeqs.put(tokSeq, true);
        } else {
          tokSeqs.put(tokSeq, isReact);
View Full Code Here


    }
   
    //if(surface.matches("([Pp]oly).+")) features.add("polymer");
    //if(surface.matches(".+\\(\\d\\d\\d+\\)")) features.add("surfacenotation");
   
    Token t = state.procDoc.getTokenByStart(annot.getAttributeValue("from"));
    if(fPrevious && t != null) {
      Token tt = t.getNAfter(-1);
      if(tt != null) {
        for(int i=1;i<=1;i++) {
          if(TokenTypes.isRef(tt) && tt.getNAfter(-1) != null) {
            //  features.add(prefix + "skiprefprev");
            tt = tt.getNAfter(-1);
          }
          String ttv = tt.getValue();
          ttv = ttv.replaceAll("\\s+", "_");
          //if(i == 1) features.add("prev" + 1 + "=" + ttv);
          features.add("pbg" + i + "=" + ttv + "_" + surface.replaceAll("\\s+", "_"));
          //features.add("pbg" + (i+1) + "=" + ttv + "_" + surface.replaceAll("\\s+", "_"));
          //features.add("pbg" + 0 + "=" + ttv + "_" + surface.replaceAll("\\s+", "_"));
          //features.add("uibg" + 0 + "=" + ttv + "_" + surface.replaceAll("\\s+", "_"));
         
          tt = tt.getNAfter(-1);
          if(tt == null) break;
        }       

       
        //if(ttv.length() > 4) features.add("prevs=" + ttv.substring(ttv.length()-4) + "_" + surface.replaceAll("\\s+", "_"));
        //features.add("psbg=" + ttv + "_" + suffix.replaceAll("\\s+", "_"));
        //features.add("pstbg=" + ttv + "_" + stem);
       
      }
    }
   
    t = state.procDoc.getTokenByEnd(annot.getAttributeValue("to"));
    if(fNext && t != null) {
      Token tt = t.getNAfter(1);
      //if(tt != null && tt.getValue().equals("-")) {
      //  tt = t.getNAfter(2);
      //}
      if(tt != null) {
        for(int i=1;i<=1;i++) {
          if(TokenTypes.isRef(tt) && tt.getNAfter(1) != null) {
            tt = tt.getNAfter(1);
          }
         
          String ttv = tt.getValue();
          ttv = ttv.replaceAll("\\s+", "_");
          //if(i == 1) features.add("next=" + ttv);
          features.add("nbg" + i + "=" + surface.replaceAll("\\s+", "_"+ "_" + ttv);
         
          //features.add("uibg" + 0 + "=" + ttv + "_" + surface.replaceAll("\\s+", "_"));

         
          tt = tt.getNAfter(1);
          if(tt == null) break;
        }
        //if(ttv.length() > 4) features.add("nexts=" + surface.replaceAll("\\s+", "_")  + "_" + ttv.substring(ttv.length()-4));
        //features.add("nsbg=" + suffix.replaceAll("\\s+", "_")  + "_" + ttv);
        //features.add("nstbg=" + stem  + "_" + ttv);
View Full Code Here

    }
   
    //if(surface.matches("([Pp]oly).+")) features.add("polymer");
    //if(surface.matches(".+\\(\\d\\d\\d+\\)")) features.add("surfacenotation");
   
    Token t = state.procDoc.getTokenByStart(annot.getAttributeValue("from"));
    if(fPrevious && t != null) {
      Token tt = t.getNAfter(-1);
      if(tt != null) {
        if(TokenTypes.isRef(tt) && tt.getNAfter(-1) != null) {
          //  features.add(prefix + "skiprefprev");
          tt = tt.getNAfter(-1);
        }
        String ttv = tt.getValue();
        ttv = ttv.replaceAll("\\s+", "_");
        features.add("prev=" + ttv);
      }
    }
   
    t = state.procDoc.getTokenByEnd(annot.getAttributeValue("to"));
    if(fNext && t != null) {
      Token tt = t.getNAfter(1);
      if(tt != null && tt.getValue().equals("-")) {
        tt = t.getNAfter(2);
      }
      if(tt != null) {
        if(TokenTypes.isRef(tt) && tt.getNAfter(1) != null) {
          tt = tt.getNAfter(1);
        }
       
        String ttv = tt.getValue();
        ttv = ttv.replaceAll("\\s+", "_");
        features.add("next=" + ttv);
      }
    }
   
View Full Code Here

    Map<TokenSequence,Boolean> tokSeqs = new HashMap<TokenSequence,Boolean>();
    Map<TokenSequence,Bag<String>> tokSeqPRWs = new HashMap<TokenSequence,Bag<String>>();
   
    for(int i=0;i<n.size();i++) {
      Element e = (Element)n.get(i);
      Token token = procDoc.getTokenByStart(e.getAttributeValue("from"));
      if(token == null) token = procDoc.getTokenByEnd(e.getAttributeValue("to"));
      if(token == null) {
        System.out.println("Eeep!");
      } else {
        TokenSequence tokSeq = token.getTokenSequence();
        //boolean isReact = "REACT".equals(SafTools.getSlotValue(e, "subtype"));
        boolean isPubmed = f.toString().contains("pubmed");
        if(tokSeqs.containsKey(tokSeq)) {
          if(isPubmed) tokSeqs.put(tokSeq, true);
        } else {
View Full Code Here

   *
   * @param word The string to test.
   */
  public static void destroyInstanceIfWordTokenises(String word) {
    if(myInstance == null) return;
    TokenSequence ts = Tokeniser.getInstance().tokenise(word);
    if(ts.getTokens().size() > 1) myInstance = null;
  }
View Full Code Here

   *
   * @param word The string to test.
   */
  public static void destroyInstanceIfWordTokenises(String word) {
    if(myInstance == null) return;
    TokenSequence ts = Tokeniser.getInstance().tokenise(word);
    if(ts.getTokens().size() > 1) myInstance = null;
  }
View Full Code Here

        ontDFANumber++;
      }
      type = type + "_" + ontDFANumber;*/
    }

    TokenSequence ts = Tokeniser.getInstance().tokenise(ne);
    List<String> tokens = ts.getTokenStringList();

    if(!alwaysAdd && tokens.size() == 1 && !ne.contains("$")) return;
    StringBuffer sb = new StringBuffer();
    for(String token : tokens) {
      sb.append(getRepForToken(token));
View Full Code Here

                featuresForAbbrev.add(abbrMode + "abbr<" + thresh);
              }             
            }
          } else {
            int tokID = ne.getFirstToken().getId();
            TokenSequence tokSeq = ne.getFirstToken().getTokenSequence();
            int length = surf.length();
            boolean isAcro = false;
            if(allCaps.matcher(surf).matches()) {
              if(length <= (tokID - 1)) {
                isAcro = true;
                for(int i=0;i<length;i++) {
                  if(!tokSeq.getToken(tokID - length - 1 + i).getValue().toUpperCase().startsWith(surf.substring(i,i+1))) isAcro = false;
                }
                if(isAcro) {
                  featuresForAbbrev.add("allUpperAbbrev");
                }
              }
View Full Code Here

    double conf = ne.getConfidence();
    double confLog = Math.log(conf) - Math.log(1 - conf);

    List<String> features = new ArrayList<String>();
   
    TokenSequence t = ne.getTokens().get(0).getTokenSequence();
    int entityLength = ne.getTokens().size();
    int startID = ne.getTokens().get(0).getId();
    int endID = startID + entityLength - 1;
    String surf = ne.getSurface();
   
View Full Code Here

     
      Bag<String> wordCounts = new Bag<String>();
     
      ss.reset();
      for(String s : ss) {
        TokenSequence t = Tokeniser.getInstance().tokenise(s);
        for(String word : t.getTokenStringList()) {
          if(!word.matches(".*[a-z][a-z].*")) continue;
          word = StringTools.normaliseName(word);
          if(!knownWords.contains(NGram.parseWord(word))) wordCounts.add(word);
        }
      } 
View Full Code Here

TOP

Related Classes of uk.ac.cam.ch.wwmm.oscar3.preprocessor.HTMLPreprocessor

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.