Package uk.ac.cam.ch.wwmm.oscar3.recogniser.document

Examples of uk.ac.cam.ch.wwmm.oscar3.recogniser.document.TokenSequenceSource


   
    Bag<String> wordCounts = new Bag<String>();
   
    ss.reset();
    for(String s : ss) {
      TokenSequence t = Tokeniser.getInstance().tokenise(s);
      for(String word : t.getTokenStringList()) {
        if(!word.matches(".*[a-z][a-z].*")) continue;
        word = StringTools.normaliseName(word);
        wordCounts.add(word);
      }
    } 
View Full Code Here


    Collections.sort(offsetArray, offsetComparator);
    System.out.println(System.currentTimeMillis() - time)
  }
 
  public int [] searchForString(String searchString) {
    TokenSequence ts = Tokeniser.getInstance().tokenise(searchString);
    List<Integer> tsl = new ArrayList<Integer>(ts.size() + 1);
    for(Token t : ts.getTokens()) {
      String s = t.getValue().toLowerCase();
      if(tokenIndex.containsKey(s)) {
        tsl.add(tokenIndex.get(s));
      } else {
        return null;
View Full Code Here

      Token token = procDoc.getTokenByStart(e.getAttributeValue("from"));
      if(token == null) token = procDoc.getTokenByEnd(e.getAttributeValue("to"));
      if(token == null) {
        System.out.println("Eeep!");
      } else {
        TokenSequence tokSeq = token.getTokenSequence();
        boolean isReact = "REACT".equals(SafTools.getSlotValue(e, "subtype"));
        if(tokSeqs.containsKey(tokSeq)) {
          if(isReact) tokSeqs.put(tokSeq, true);
        } else {
          tokSeqs.put(tokSeq, isReact);
View Full Code Here

        c = reader.read();
      }     
    } catch (Exception e) {
      throw new Error(e);
    }
    TokenSequence t = Tokeniser.getInstance().tokenise(sb.toString());
   
    TokenStream ts = new Oscar3TokenStream(t);
    ts = new Oscar3Filters(ts);
    return ts;
  }
View Full Code Here

      Token token = procDoc.getTokenByStart(e.getAttributeValue("from"));
      if(token == null) token = procDoc.getTokenByEnd(e.getAttributeValue("to"));
      if(token == null) {
        System.out.println("Eeep!");
      } else {
        TokenSequence tokSeq = token.getTokenSequence();
        //boolean isReact = "REACT".equals(SafTools.getSlotValue(e, "subtype"));
        boolean isPubmed = f.toString().contains("pubmed");
        if(tokSeqs.containsKey(tokSeq)) {
          if(isPubmed) tokSeqs.put(tokSeq, true);
        } else {
View Full Code Here

    //files = files.subList(0, 10);
   
   
    Bag<String> tokenBag = new Bag<String>();
   
    TokenSequenceSource tss = new TokenSequenceSource(files);
    int i=0;
    for(TokenSequence ts : tss) {
      for(Token t : ts.getTokens()) {
        tokenBag.add(t.getValue().intern());
      }
View Full Code Here

      corpusOffset++;
    }
  }
 
  public InverseSearcher(List<File> files) throws Exception {
    TokenSequenceSource tss = new TokenSequenceSource(files);

    corpusArray = new ArrayList<Integer>();
    offsetArray = new ArrayList<Integer>();
    tokenList = new ArrayList<String>();
    tokenIndex = new HashMap<String,Integer>();
View Full Code Here

    }
    Bag<String> allTerms = new Bag<String>();
    String word = "reduction";
    List<File> files = FileTools.getFilesFromDirectoryByName(new File("/home/ptc24/newows/corpora/BioIE"), "source.xml");
    //List<File> files = FileTools.getFilesFromDirectoryByName(new File("/scratch/pubmed/2005"), "source.xml");
    TokenSequenceSource ts = new TokenSequenceSource(files);
    for(TokenSequence t : ts) {
      List<String> tokens = t.getTokenStringList();
      for(int i=0;i<tokens.size();i++) {
        if(tokens.get(i).equals(word)) {
          for(int j=Math.max(0, i-windowSize);j<=Math.min(i+windowSize,tokens.size()-1);j++) {
View Full Code Here

      for(String word : t.getTokenStringList()) {
        terms.add(StringTools.normaliseName(word));
      }
    }*/
   
    TokenSequenceSource tokSeqSource = new TokenSequenceSource(files);
    tokSeqSource.reset();
    for(TokenSequence t : tokSeqSource) {
      for(String word : t.getTokenStringList()) {
        word = StringTools.normaliseName(word);
        word = word.replaceAll("\\s+", "_");
        terms.add(word.intern());
View Full Code Here

      beforeWidth = allowableWidth / 3;
      afterWidth = (2 * allowableWidth) / 3;     
    }*/
   
    //StringSource ss = new StringSource(files, false);
    TokenSequenceSource ts = new TokenSequenceSource(files);
    boolean doSort = true;
    for(TokenSequence t : ts) {
      //s = s.replaceAll("\\s+", " ");
      //Tokeniser t = new Tokeniser(null);
      //t.tokenise(s);
      int sstart = t.getOffset();
      int send = t.getSourceString().length() + sstart;
      for(Token token : t.getTokens()) {
        if(token.getValue().equalsIgnoreCase(word)) {
          int wstart = token.getStart();
          int wend = token.getEnd();
          String before = t.getStringAtOffsets(Math.max(sstart, wstart-beforeWidth), wstart);
          before = before.replaceAll("\\s+", " ");
          if(before.startsWith(" ")) before = "." + before.substring(1);
          if(before.length() < beforeWidth) {
            before = StringTools.multiplyString(".", beforeWidth - before.length()) + before;
          }
          //if(wstart < beforeWidth) {
          //  before = StringTools.multiplyString(" ", beforeWidth - wstart) + t.getStringAtOffsets(0, wstart);
          //} else {
          //  before = t.getStringAtOffsets(wstart - beforeWidth, wstart);
          //}
          String after = t.getStringAtOffsets(wend, Math.min(send, wend + afterWidth));
          after = after.replaceAll("\\s+", " ");
          //if(s.length() - wend > afterWidth) {
          //  after = s.substring(wend, wend + afterWidth);
          //} else {
          //  after = s.substring(wend);
          //}
          String display = before + "  " + token.getValue() + "  " + after;
          String sort = "";
          if("interleave".equals(mode)) {
            sort = interleave(before, after, true);           
          } else if("left".equals(mode)) {
            StringBuffer sb = new StringBuffer(before.toLowerCase());
            sb.reverse();
            sort = sb + "\n" + after.toLowerCase();
          } else if("right".equals(mode)) {
            sort = after.toLowerCase() + "\n" + before.toLowerCase();           
          } else {
            doSort = false;
          }
          ConcordanceEntry entry = new ConcordanceEntry();
          entry.text = display;
          entry.start = token.getStartXPoint();
          entry.end = token.getEndXPoint();
          entry.file = ts.getCurrentFile().getAbsolutePath();
          concordance.put(entry, sort);
        }
      }
    }
    if(!doSort) {
View Full Code Here

TOP

Related Classes of uk.ac.cam.ch.wwmm.oscar3.recogniser.document.TokenSequenceSource

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.