Package edu.pitt.info.extract.model

Examples of edu.pitt.info.extract.model.AnnotatedDocument


  private void processReport(List<Template> templates, File reportFile,File outputDir) throws Exception {
    progress("processing report ("+(processCount+1)+") "+reportFile.getName()+" ... ");
   
    // read in the report
    String text = TextTools.getText(new FileInputStream(reportFile));
    AnnotatedDocument doc = new AnnotatedDocument();
    doc.setName(reportFile.getName());
    doc.setText(text);
    doc.getFilters().addAll(DocumentFilter.getDeIDFilters());
   
    // combine terminologies into a single instance and add filters
    CompositTerminology terminology = new CompositTerminology();
    for(Template t: templates){
      doc.getFilters().addAll(t.getFilters());
      terminology.addTerminology(t.getTerminology());
    }
   
   
    // do a simple parsing of this document
    long time = System.currentTimeMillis();
    int offset = 0;
    for(String line: getLines(doc.getFilteredDocument())){
      // skip synoptic sections
      if(doc.isSynopticSection(offset))
        continue;
   
      for(String sentence: getSentences(line)){
        for(String phrase: getPhrases(sentence) ){
          int offs = line.indexOf(phrase);
          for(Concept c: terminology.search(phrase,IndexFinderTerminology.BEST_MATCH)){
            for(Annotation a: c.getAnnotations()){ 
              if(a.getOffset() < (offset+offs))
                a.updateOffset(offset+offs);
              doc.addAnnotation(a);
            }
            doc.addConcept(c);
          }
        }
      }
      offset += (line.length()+1);
    }
    doc.sort();
   
  /*  System.out.println("--------------");
    for(Annotation a: doc.getAnnotations()){
      System.out.println(a.getText()+" ... ("+a.getStartPosition()+","+a.getEndPosition()+ ") \t\t"+a.getConcept().getName()+" ... "+a.getConcept().getCode());
    }
    System.out.println("\n----------------( "+time+" )-------------------\n");*/
   
   
    // get a list of processed concepts
    Map<Template,List<ItemInstance>> resultMap = new LinkedHashMap<Template, List<ItemInstance>>();
   
    // now lets do information extraction
    for(Template template: templates){
      if(template.isAppropriate(doc)){
        List<ItemInstance> items = template.process(doc);
        resultMap.put(template,items);
       
        // re-add annotation
        for(ItemInstance i: items){
          for(Annotation a: i.getAnnotations()){
            if(!doc.getAnnotations().contains(a))
              doc.getAnnotations().add(a);
          }
        }
      }
    }
    doc.sort();
   
    long total = System.currentTimeMillis()-time;
    processCount ++;
    processTime += total;
     
View Full Code Here

TOP

Related Classes of edu.pitt.info.extract.model.AnnotatedDocument

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.