Package uk.ac.cam.ch.wwmm.oscar3.recogniser.document

Examples of uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocument


                featuresForAbbrev.add(abbrMode + "abbr<" + thresh);
              }
            }
          } else {
            // Test whether an all-caps entity is an acronym of the tokens that
            // immediately precede it in the token sequence.
            int tokID = ne.getFirstToken().getId();
            TokenSequence tokSeq = ne.getFirstToken().getTokenSequence();
            int length = surf.length();
            boolean isAcro = false;
            if(allCaps.matcher(surf).matches()) {
              // Only test when there are enough preceding tokens, one per letter.
              if(length <= (tokID - 1)) {
                isAcro = true;
                for(int i = 0; i < length; i++) {
                  // Each letter of the candidate must be the initial of the corresponding preceding token.
                  if(!tokSeq.getToken(tokID - length - 1 + i).getValue().toUpperCase().startsWith(surf.substring(i, i + 1))) isAcro = false;
                }
                if(isAcro) {
                  featuresForAbbrev.add("allUpperAbbrev");
                }
              }
View Full Code Here
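
The else branch above implements a simple acronym test: an all-upper-case entity is accepted only when each of its letters matches the initial of the corresponding token immediately preceding it. A self-contained sketch of that check, assuming a plain String[] of preceding token values in place of OSCAR3's TokenSequence (the class and method names below are illustrative, not part of OSCAR3):

import java.util.regex.Pattern;

public class AcronymCheck {
    private static final Pattern ALL_CAPS = Pattern.compile("[A-Z]+");

    /**
     * True if surf is all upper case and its i-th letter is the initial
     * (case-insensitively) of the i-th preceding token.
     */
    static boolean looksLikeAcronym(String surf, String[] precedingTokens) {
        if (!ALL_CAPS.matcher(surf).matches()) return false;
        if (precedingTokens.length < surf.length()) return false;
        for (int i = 0; i < surf.length(); i++) {
            if (!precedingTokens[i].toUpperCase().startsWith(surf.substring(i, i + 1))) return false;
        }
        return true;
    }

    public static void main(String[] args) {
        // "HPLC" preceded by "high performance liquid chromatography" -> true
        System.out.println(looksLikeAcronym("HPLC",
                new String[] {"high", "performance", "liquid", "chromatography"}));
    }
}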


    // Convert the recogniser's confidence score into log-odds (logit) space.
    double conf = ne.getConfidence();
    double confLog = Math.log(conf) - Math.log(1 - conf);

    List<String> features = new ArrayList<String>();

    // Locate the entity within its token sequence.
    TokenSequence t = ne.getTokens().get(0).getTokenSequence();
    int entityLength = ne.getTokens().size();
    int startID = ne.getTokens().get(0).getId();
    int endID = startID + entityLength - 1;
    String surf = ne.getSurface();
   
View Full Code Here
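
confLog above is the log-odds (logit) of the recogniser's confidence score: it turns a probability strictly between 0 and 1 into an unbounded value, a form often used as a classifier feature. A minimal illustration, independent of OSCAR3's NamedEntity class:

public class LogOdds {
    /** Converts a probability strictly between 0 and 1 to its log-odds. */
    static double logit(double p) {
        if (p <= 0.0 || p >= 1.0) {
            throw new IllegalArgumentException("p must be strictly between 0 and 1");
        }
        return Math.log(p) - Math.log(1 - p);
    }

    public static void main(String[] args) {
        System.out.println(logit(0.5));   // 0.0
        System.out.println(logit(0.9));   // approx. 2.197
        System.out.println(logit(0.1));   // approx. -2.197
    }
}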

     
      Bag<String> wordCounts = new Bag<String>();

      ss.reset();
      for(String s : ss) {
        TokenSequence t = Tokeniser.getInstance().tokenise(s);
        for(String word : t.getTokenStringList()) {
          // Only consider words containing at least two adjacent lower-case letters.
          if(!word.matches(".*[a-z][a-z].*")) continue;
          word = StringTools.normaliseName(word);
          // Count only words that are not already in the known-word list.
          if(!knownWords.contains(NGram.parseWord(word))) wordCounts.add(word);
        }
      }
View Full Code Here
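
Bag is an OSCAR3 multiset utility. The same counting logic can be sketched with a plain HashMap; here a whitespace split and toLowerCase stand in for OSCAR3's Tokeniser and StringTools.normaliseName, and the knownWords set is a hypothetical stand-in for the NGram known-word lookup:

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class UnknownWordCounter {
    /** Counts words with two adjacent lower-case letters that are not in knownWords. */
    static Map<String, Integer> countUnknownWords(Iterable<String> sentences, Set<String> knownWords) {
        Map<String, Integer> wordCounts = new HashMap<>();
        for (String sentence : sentences) {
            for (String word : sentence.split("\\s+")) {
                if (!word.matches(".*[a-z][a-z].*")) continue;   // skip numbers, symbols, bare acronyms
                String normalised = word.toLowerCase();
                if (knownWords.contains(normalised)) continue;
                wordCounts.merge(normalised, 1, Integer::sum);
            }
        }
        return wordCounts;
    }

    public static void main(String[] args) {
        Map<String, Integer> counts = countUnknownWords(
                List.of("the ethyl acetate layer", "the aqueous layer"),
                Set.of("the", "layer"));
        System.out.println(counts);   // {ethyl=1, acetate=1, aqueous=1} (order may vary)
    }
}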

      Elements lineElems = elems.get(i).getChildElements();
      //List<String> wordsForWord = new ArrayList<String>();
      Set<String> wordsForWord = new HashSet<String>();
      for(int j = 0; j < lineElems.size(); j++) {
        String lineVal = lineElems.get(j).getValue();
        TokenSequence t = Tokeniser.getInstance().tokenise(lineVal);
        boolean found = false;
        List<String> tl = new ArrayList<String>();
        for(String s : t.getTokenStringList()) {
          // Replace the headword itself with a placeholder so only its context is recorded.
          if(s.compareToIgnoreCase(name) == 0) {
            tl.add("*WORD*");
            found = true;
          } else {
            //tl.add(StringTools.normaliseName2(s));
View Full Code Here
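
Each line element above is tokenised and every token equal to the headword (ignoring case) is replaced by a *WORD* placeholder, so that later feature extraction sees only the surrounding context. A standalone version of that substitution, using a whitespace split instead of OSCAR3's Tokeniser:

import java.util.ArrayList;
import java.util.List;

public class HeadwordMask {
    /** Replaces tokens equal to name (ignoring case) with the placeholder "*WORD*". */
    static List<String> maskHeadword(String line, String name) {
        List<String> out = new ArrayList<>();
        for (String token : line.split("\\s+")) {
            out.add(token.equalsIgnoreCase(name) ? "*WORD*" : token.toLowerCase());
        }
        return out;
    }

    public static void main(String[] args) {
        System.out.println(maskHeadword("Benzene is an aromatic hydrocarbon", "benzene"));
        // [*WORD*, is, an, aromatic, hydrocarbon]
    }
}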

   
    int nb = 0;
    int b = 0;

    for(int i = 0; i < n.size(); i++) {
      TokenSequence t = Tokeniser.getInstance().tokenise(n.get(i).getValue());
      // An element is "boring" unless it contributes at least one new alphabetic word.
      boolean isBoring = true;
      for(String s : t.getTokenStringList()) {
        s = StringTools.normaliseName(s);
        if(!s.matches(".*[A-Za-z].*")) continue;   // skip tokens without letters
        if(boring.contains(s)) continue;           // skip words already seen
        boring.add(s);
        isBoring = false;
View Full Code Here
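
The loop above treats an element as boring unless it contributes at least one alphabetic word that has not been seen before; the boring set doubles as the store of seen words. A minimal standalone version, again with a whitespace split standing in for the Tokeniser:

import java.util.HashSet;
import java.util.Set;

public class NoveltyCheck {
    private final Set<String> seen = new HashSet<>();

    /** Returns true only if the line contains a new word with at least one letter. */
    boolean contributesNewWord(String line) {
        boolean isBoring = true;
        for (String word : line.toLowerCase().split("\\s+")) {
            if (!word.matches(".*[A-Za-z].*")) continue;   // skip pure numbers and punctuation
            if (seen.add(word)) isBoring = false;          // add() returns true only for unseen words
        }
        return !isBoring;
    }

    public static void main(String[] args) {
        NoveltyCheck check = new NoveltyCheck();
        System.out.println(check.contributesNewWord("ethyl acetate 5 mL"));   // true
        System.out.println(check.contributesNewWord("ethyl acetate"));        // false
    }
}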

   
    Bag<String> wordCounts = new Bag<String>();

    ss.reset();
    for(String s : ss) {
      TokenSequence t = Tokeniser.getInstance().tokenise(s);
      for(String word : t.getTokenStringList()) {
        // Count every word containing at least two adjacent lower-case letters.
        if(!word.matches(".*[a-z][a-z].*")) continue;
        word = StringTools.normaliseName(word);
        wordCounts.add(word);
      }
    }
View Full Code Here

    Collections.sort(offsetArray, offsetComparator);
    System.out.println(System.currentTimeMillis() - time);
  }

  public int [] searchForString(String searchString) {
    // Map each query token to its integer id; give up if any token is not in the index.
    TokenSequence ts = Tokeniser.getInstance().tokenise(searchString);
    List<Integer> tsl = new ArrayList<Integer>(ts.size() + 1);
    for(Token t : ts.getTokens()) {
      String s = t.getValue().toLowerCase();
      if(tokenIndex.containsKey(s)) {
        tsl.add(tokenIndex.get(s));
      } else {
        return null;
View Full Code Here
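
searchForString maps each lower-cased query token to its integer id via tokenIndex and returns null as soon as one token is unknown, since a phrase containing an unindexed token cannot occur in the corpus. A self-contained sketch of that lookup step, assuming the Map<String, Integer> index has been built elsewhere:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class TokenIdLookup {
    /** Maps each query token to its id, or returns null if any token is not indexed. */
    static List<Integer> toTokenIds(String query, Map<String, Integer> tokenIndex) {
        List<Integer> ids = new ArrayList<>();
        for (String token : query.toLowerCase().split("\\s+")) {
            Integer id = tokenIndex.get(token);
            if (id == null) return null;   // unknown token: the phrase cannot match anything
            ids.add(id);
        }
        return ids;
    }

    public static void main(String[] args) {
        Map<String, Integer> index = new HashMap<>();
        index.put("ethyl", 0);
        index.put("acetate", 1);
        System.out.println(toTokenIds("ethyl acetate", index));   // [0, 1]
        System.out.println(toTokenIds("diethyl ether", index));   // null
    }
}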

      // Find the token at the annotation's start offset, falling back to its end offset.
      Token token = procDoc.getTokenByStart(e.getAttributeValue("from"));
      if(token == null) token = procDoc.getTokenByEnd(e.getAttributeValue("to"));
      if(token == null) {
        System.out.println("Eeep!");
      } else {
        TokenSequence tokSeq = token.getTokenSequence();
        boolean isReact = "REACT".equals(SafTools.getSlotValue(e, "subtype"));
        // Mark the whole token sequence as REACT if any annotation on it has that subtype.
        if(tokSeqs.containsKey(tokSeq)) {
          if(isReact) tokSeqs.put(tokSeq, true);
        } else {
          tokSeqs.put(tokSeq, isReact);
View Full Code Here
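
The tokSeqs map above records, for each token sequence, whether any annotation on it has subtype REACT: once a sequence has been marked true it stays true. A minimal sketch of that merge-with-logical-OR pattern, using plain strings as keys in place of TokenSequence objects:

import java.util.HashMap;
import java.util.Map;

public class ReactFlagMerge {
    /** Records whether any annotation on a sequence carried the REACT flag. */
    static void mark(Map<String, Boolean> tokSeqs, String seq, boolean isReact) {
        if (tokSeqs.containsKey(seq)) {
            if (isReact) tokSeqs.put(seq, true);   // never flips back to false
        } else {
            tokSeqs.put(seq, isReact);
        }
    }

    public static void main(String[] args) {
        Map<String, Boolean> tokSeqs = new HashMap<>();
        mark(tokSeqs, "para1", false);
        mark(tokSeqs, "para1", true);    // one REACT annotation marks the whole sequence
        mark(tokSeqs, "para1", false);   // later non-REACT annotations do not undo it
        mark(tokSeqs, "para2", false);
        System.out.println(tokSeqs);     // {para1=true, para2=false} (order may vary)
    }
}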

    //files = files.subList(0, 10);
   
   
    Bag<String> tokenBag = new Bag<String>();

    // Count the frequency of every token across the corpus files.
    TokenSequenceSource tss = new TokenSequenceSource(files);
    int i = 0;
    for(TokenSequence ts : tss) {
      for(Token t : ts.getTokens()) {
        tokenBag.add(t.getValue().intern());
      }
View Full Code Here

      corpusOffset++;
    }
  }
 
  public InverseSearcher(List<File> files) throws Exception {
    // Build the inverted-index structures over the token sequences of the given files.
    TokenSequenceSource tss = new TokenSequenceSource(files);

    corpusArray = new ArrayList<Integer>();
    offsetArray = new ArrayList<Integer>();
    tokenList = new ArrayList<String>();
    tokenIndex = new HashMap<String,Integer>();
View Full Code Here
