Package org.wltea.analyzer.lucene

Examples of org.wltea.analyzer.lucene.IKQueryParser$ExpressionParser$Element


      ArrayList<String> syns=new ArrayList<String>(1);
      syns.add(sid);
      s=new Sense(sid,"",syns);
      URL url=new URL(this.path+"wiki/"+sid.replace(" ", "_"));
      Document xml=this.loadURL(url);
      Element body=this.getContentNode(xml);
      //Check if definition is a section
      if(!xml.getRootElement().getValue().contains(this.redirect))
      {
        boolean start=false;
       
        for(Element e:body.getChildren())
        {
          if(e.getName().equals("p"))
          {
            start=true;
            s.addBagOfWords(e.getValue(),e.getValue().split(" "),this.name);
            for(Element ge:e.getChildren())
            {
              //Retrieve all the <a> for extracting the inGloss relation
              if(ge.getName().equals("a"))
              {
                String nurl=ge.getAttributeValue("href");
                s.addRelation("inGloss", new Relation("inGloss", nurl.replace("/wiki/", ""), ""));           
              }
            }
          }
          else
          {
            if(start)
              break;
          }
        }

      }
      else//Look for the start of the section
      {
        String all=xml.getRootElement().getValue();
        String section=all.substring(all.indexOf(this.redirect)+this.redirect.length());
        section=section.substring(0,section.indexOf("\")"));
       
        boolean start=false;
        for(Element e:body.getChildren())
        {
          if(start)
          {
            if(e.getName().equals("p"))
            {
              s.addBagOfWords(e.getValue(),e.getValue().split(" "),this.name);
              for(Element ge:e.getChildren())
              {
                //Retrieve all the <a> for extracting the inGloss relation
                if(ge.getName().equals("a"))
                {
                  String nurl=ge.getAttributeValue("href");
                  s.addRelation("inGloss", new Relation("inGloss", nurl.replace("/wiki/", ""), ""));           
                }
              }
            }
            else
            {
              break;
            }

          }
          else
          {
            if(e.getName().startsWith("h"))
            {
              for(Element es:e.getChildren())
              {
                if(es.getName().equals("span")&&section.equals(es.getAttributeValue("id")))
                {               
                  start=true;
                }
              }
            }
          }
         
        }
      }
      Element navbox=null;
      for(Element e:xml.getRootElement().getDescendants(new ElementFilter("table")))
      {
        if(e.getAttributeValue("class")!=null&&e.getAttributeValue("class").equals("navbox"))
        {
            navbox=e;
            break;
        }
      }
      //Add inNavBox relations
      if(navbox!=null)
      {
        for(Element e:navbox.getDescendants(new ElementFilter("a")))
        {
          String nurl=e.getAttributeValue("href");
          if(nurl!=null&&!this.isNotAnArticle(nurl))
          {
            s.addRelation("inNavBox", new Relation("inNavBox", nurl.replace("/wiki/", ""), ""));
          }
        }
      }
      //Add in CatLinks relations
      Element catlinks=null;
      String aux="catlinks";
      for(Element e:body.getParent().getDescendants(new ElementFilter("div")))
      {
        if(aux.equals(e.getAttributeValue("id")))
        {
          catlinks=e;
          break;
        }
      }
      if(catlinks!=null)
      {
        for(Element e:catlinks.getDescendants(new ElementFilter("a")))
        {
          String nurl=e.getAttributeValue("href");
          if(nurl!=null&&!this.isNotAnArticle(nurl))
          {
            s.addRelation("inCatLinks", new Relation("inCatLinks", nurl.replace("/wiki/", ""), ""));
View Full Code Here


      f.mkdirs();
      FileWriter fout=new FileWriter(path+sid+".sgf");
      BufferedWriter out=new BufferedWriter(fout);
    URL url=new URL(this.path+"wiki/"+sid);
    Document xml=this.loadURL(url);
    Element body=this.getContentNode(xml);   
    DataBroker db=new DataBroker("gannuNLP.dictionaries.Wiki",this.language);
    db.setPath(this.path);   
    db.load("Glosses");
   
    out.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
    out.write("<contextfile concordance=\""+this.name.replace("->", ".")+"\">\n");
    out.write("\t<context filename=\""+sid.replace("&","&amp;")+"\" paras=\"yes\">\n");   
    int p=1;
    int s=1;
    String paragraph="";
    String sentence="";
    paragraph+="\t\t<p pnum=\""+String.valueOf(p)+"\">\n";
    sentence+="\t\t\t<s snum=\""+String.valueOf(s)+"\">\n";
    ArrayList<Content> stack=new ArrayList<Content>();
    stack.addAll(body.getContent());
    while(stack.size()>0)
    {
      Content c=stack.get(0);
      stack.remove(0);
      if(c.getCType().equals(CType.Text))//actual text
      {
        //a dot creates a new sentence after processing
        String line=c.getValue().trim();
        while(!line.equals(""))
        {         
          int idx=line.indexOf(" ");
          String words;
          if(idx>=0)
            words=line.substring(0,idx);
          else
            words=line;
          line=line.substring(words.length()).trim();
          String punct=words.replaceAll("\\p{Punct}","�");
          int index=0;
          while(!punct.equals(""))
          {
            idx=punct.indexOf("�");
            String word;
            if(idx>=0)
              word=punct.substring(0,idx);
            else
              word=punct;
            if(word.equals(""))
            {
              //first the punctuation then the word
              //add a punc node
             
              if(words.charAt(index)=='<')
              {
                sentence+="\t\t\t\t<punc>&lt;</punc>\n";
              }
              else
              {
                if(words.charAt(index)=='>')
                  sentence+="\t\t\t\t<punc>&gt;</punc>\n";
                else
                  sentence+="\t\t\t\t<punc>"+words.charAt(index)+"</punc>\n";
              }
              if(words.charAt(index)=='.')
              {
                sentence+=("\t\t\t</s>\n");
                if(sentence.contains("wf"))
                {
                  System.out.print(".");
                  s++;
                  paragraph+=sentence;
                }
                sentence="\t\t\t<s snum=\""+String.valueOf(s)+"\">\n";
              }
              index++;
              punct=punct.substring(1);
            }
            else
            {
              index+=word.length();
              sentence+="\t\t\t\t<wf cmd=\"tag\" pos=\"\" lemma=\""+word+"\" wnsn=\"0\" lexsn=\"NA\">";
              sentence+=word;
              sentence+="</wf>\n";
              punct=punct.substring(word.length());
            }
             
          }
           
        }
      }
      if(c.getCType().equals(CType.Element))//other html elements such a or table should extract the text inside these elements
      {
        Element current=(Element)c;
        //tr creates a new sentence after processing
        String href=current.getAttributeValue("href");
     
        String aux="navbox";
        if(aux.equals(current.getAttributeValue("class")))
          break;
        if(href!=null&&current.getName().equals("a")&&!this.isNotAnArticle(href)&&!href.contains("Category:"))
        {
          if(!href.contains("%25"))
          {
            while(href.contains("%"))
            {
              int index=href.indexOf("%");         
              String first=href.substring(0,index);
              if(index>href.length())
                index=href.length();
              String last=href.substring(index+3);
              String hex="0x"+href.substring(index+1,index+3);
              byte b[];
              if(last.startsWith("%"))
              {
                b=new byte[2];             
                b[0]=(byte)Integer.decode(hex).intValue();
                b[1]=(byte)Integer.decode("0x"+last.substring(1,3)).intValue();
                last=last.substring(3);
              }
              else
              {
                b=new byte[1];
                b[0]=(byte)Integer.decode(hex).intValue();
              }
              href=first+new String(b,"UTF-8")+last;
            }
          }
          //Lematize the wiki word
          String word=current.getValue();
          String lemma=word;
          Lemma l=db.getLemma(word);
          href=href.substring(href.indexOf("wiki/")+5);
          boolean ac=true;
          if(l==null)
          {
              l=db.getLemma(href);
              if(l!=null)
                lemma=l.getLemma();
              ac=false;
          }
          if(l!=null)
          {
            int i=0;
            boolean ban=false;
            for(Sense sense:l.getSenses())
            {
              i++;
              if(sense.getSid().equals(href))
              {
                ban=true;
                break;
              }
            }
            String wnsn="";
            if(ban)
            {
              wnsn=String.valueOf(i);
            }
            else
            {
              if(ac)
              {
                l=db.getLemma(href);
                if(l!=null)
                {
                  i=0;
                  ban=false;
                  for(Sense sense:l.getSenses())
                  {
                    i++;
                    if(sense.getSid().equals(href))
                    {
                      ban=true;
                      break;
                    }
                  }
                  if(ban)
                    wnsn=String.valueOf(i);
                }
              }
            }
            if(wnsn.equals("")&&l!=null)
            {
              Sense sense=this.getSense(href);
              ban=false;
              i=0;
              for(Sense sx:l.getSenses())
              {
                i++;
                if(sense.itContainsTheSameSamples(sx))
                {
                  ban=true;
                  break;
                }
              }
            }
            if(ban)
              wnsn=String.valueOf(i);
            if(wnsn.equals(""))
            {  stack.addAll(0,current.getContent());
              out.write("\t\t\t\t<!--Mismatch link for "+href.replace("&","&amp;")+" -->\n");
            }
            else
            {
              sentence+="\t\t\t\t<wf cmd=\"done\" pos=\"\" lemma=\""+Dictionary.normalizeLemmaforFile(lemma)
                  +"\" wnsn=\""+wnsn
                  +"\" lexsn=\""+Dictionary.normalizeLemmaforFile(l.getSenses().get(i-1).getSid())+"\">";
              sentence+=word;
              sentence+="</wf>\n";
            }
          }
          else
          {
            stack.addAll(0,current.getContent());
          }
         
        }
        else
        {
          if(current.getName().equals("tr")||current.getName().equals("p"))
          {
            sentence+=("\t\t\t</s>\n");
            if(sentence.contains("wf"))
            {
              System.out.print(".");
              s++;
              paragraph+=sentence;
            }
            if(paragraph.contains("wf"))
            {
              System.out.println("Saving paragraph "+String.valueOf(p));
              p++;
              paragraph+=("\t\t</p>\n");
              out.write(paragraph.replace("&","&amp;"));
            }
            s=1;
            sentence="\t\t\t<s snum=\""+String.valueOf(s)+"\">\n";
            paragraph="\t\t<p pnum=\""+String.valueOf(p)+"\">\n";
          }
          stack.addAll(0,current.getContent());       
        }
      }
    }
    sentence+=("\t\t\t</s>\n");
    if(sentence.contains("wf"))
View Full Code Here

    return this.sq;
  }
 
  public void saveSQ(double sq)
  {
    Element thisSQ = new Element("sq");
    thisSQ.setText(sq+"");
    this.root.addContent(thisSQ);
    this.saveXML();
  }
View Full Code Here

      {
        String actualLanguage = (((Element) langList.getSelectedItem()).getName());
       
        String contentStr = content.getText();
       
        Element text = new Element(key);
        text.setText(contentStr);
       
        this.la.root.getChild(actualLanguage).addContent(text);
       
        this.saveXML();
       
View Full Code Here

   * </p>
   */
  private void bouttonSaveLangClicked()
  {
    String lang = langField.getText();
    this.la.root.addContent(new Element(lang));
    this.saveXML();
    this.newLangDialog.setVisible(false);
  }
View Full Code Here

          ArrayList<Element> words=new ArrayList<Element>();
          for(Element word:xml.getDescendants(new ElementFilter("wf")))
            words.add(word);
          for(int w=0;w<words.size();w++)
          {
            Element word=words.get(w);
            if(word.getAttribute("ot")!=null)
            {
              word.setAttribute("cmd", "ignore");
            }
            if((word.getAttribute("cmd").getValue().equals("done"))||(tag&&(word.getAttribute("cmd").getValue().equals("tag"))))
            {
              String lemma=word.getValue();
              String pos=word.getAttribute("pos").getValue().substring(0,1);
              String lem;
              Lemma l=null;
              if(word.getAttribute("cmd").getValue().equals("done"))
                { 
                if(word.getAttribute("lemma")!=null)
                {
                  lemma=word.getAttribute("lemma").getValue();
                    pos=word.getAttribute("pos").getValue().substring(0,1);
                    lem=lemma+"_"+pos;
                    l=dic.getLemma(lem);                 
                }
                else
                {
                  lemma="";
                  pos="X";
                }
                }
              lem=lemma+"_"+pos;
 
             
              if(l==null)//lemma may not be in normal form
              {
               
                SemCorCleaner.displayWindow(words, w);
                SemCorCleaner.readLemma(dic, lemma, word, pos);
                l=dic.getLemma(word.getAttributeValue("lemma")+"_"+word.getAttributeValue("pos").substring(0,1));
              }
              if(l!=null)
              {
                if(!SemCorCleaner.isValidWNSN(word.getAttributeValue("wnsn"), l.getSenses().size()))
                {
                  SemCorCleaner.displayWindow(words, w);
                  SemCorCleaner.checkSenses(word, l);
                }
              }
View Full Code Here

     @throws IOException For other kinds of errors.
     */

    public String translate( String html, XHtmlToWikiConfig config ) throws JDOMException, IOException
    {
        Element element = htmlStringToElement( html );
        XHtmlElementToWikiTranslator xhtmlTranslator = new XHtmlElementToWikiTranslator( element, config );
        String wikiMarkup = xhtmlTranslator.getWikiString();
        return wikiMarkup;
    }
View Full Code Here

     */
    private Element htmlStringToElement( String html ) throws JDOMException, IOException
    {
        SAXBuilder builder = new SAXBuilder( CYBERNEKO_PARSER, true );
        Document doc = builder.build( new StringReader( html ) );
        Element element = doc.getRootElement();
        return element;
    }
View Full Code Here

                m_out.print( s );
            }
        }
        else if( element instanceof Element )
        {
            Element base = (Element)element;
            String n = base.getName().toLowerCase();
            if( "imageplugin".equals( base.getAttributeValue( "class" ) ) )
            {
                printImage( base );
            }
            else if( "wikiform".equals( base.getAttributeValue( "class" ) ) )
            {
                // only print the children if the div's class="wikiform", but not the div itself.
                printChildren( base );
            }
            else
            {
                boolean bold = false;
                boolean italic = false;
                boolean monospace = false;
                String cssSpecial = null;
                String cssClass = base.getAttributeValue( "class" );

                // accomodate a FCKeditor bug with Firefox: when a link is removed, it becomes <span class="wikipage">text</span>.
                boolean ignoredCssClass = cssClass != null && cssClass.matches( "wikipage|createpage|external|interwiki|attachment" );

                Map styleProps = null;
View Full Code Here

        for( Iterator i = base.getContent().iterator(); i.hasNext(); )
        {
            Object c = i.next();
            if( c instanceof Element )
            {
                Element e = (Element)c;
                String n = e.getName().toLowerCase();
                if( n.equals( "h1" ) )
                {
                    m_out.print( "\n!!! " );
                    print( e );
                    m_out.println();
                }
                else if( n.equals( "h2" ) )
                {
                    m_out.print( "\n!!! " );
                    print( e );
                    m_out.println();
                }
                else if( n.equals( "h3" ) )
                {
                    m_out.print( "\n!! " );
                    print( e );
                    m_out.println();
                }
                else if( n.equals( "h4" ) )
                {
                    m_out.print( "\n! " );
                    print( e );
                    m_out.println();
                }
                else if( n.equals( "p" ) )
                {
                    if( e.getContentSize() != 0 ) // we don't want to print empty elements: <p></p>
                    {
                        m_out.println();
                        print( e );
                        m_out.println();
                    }
                }
                else if( n.equals( "br" ) )
                {
                    if( m_preStack.isPreMode() )
                    {
                        m_out.println();
                    }
                    else
                    {
                        String parentElementName = base.getName().toLowerCase();

                        //
                        // To beautify the generated wiki markup, we print a newline character after a linebreak.
                        // It's only safe to do this when the parent element is a <p> or <div>; when the parent
                        // element is a table cell or list item, a newline character would break the markup.
                        // We also check that this isn't being done inside a plugin body.
                        //
                        if( parentElementName.matches( "p|div" )
                            && !base.getText().matches( "(?s).*\\[\\{.*\\}\\].*" ) )
                        {
                            m_out.print( " \\\\\n" );
                        }
                        else
                        {
                            m_out.print( " \\\\" );
                        }
                    }
                    print( e );
                }
                else if( n.equals( "hr" ) )
                {
                    m_out.println();
                    print( "----" );
                    print( e );
                    m_out.println();
                }
                else if( n.equals( "table" ) )
                {
                    if( !m_outTimmer.isCurrentlyOnLineBegin() )
                    {
                        m_out.println();
                    }
                    print( e );
                }
                else if( n.equals( "tr" ) )
                {
                    print( e );
                    m_out.println();
                }
                else if( n.equals( "td" ) )
                {
                    m_out.print( "| " );
                    print( e );
                    if( !m_preStack.isPreMode() )
                    {
                        print( " " );
                    }
                }
                else if( n.equals( "th" ) )
                {
                    m_out.print( "|| " );
                    print( e );
                    if( !m_preStack.isPreMode() )
                    {
                        print( " " );
                    }
                }
                else if( n.equals( "a" ) )
                {
                    if( !isIgnorableWikiMarkupLink( e ) )
                    {
                        if( e.getChild( "IMG" ) != null )
                        {
                            printImage( e );
                        }
                        else
                        {
                            String ref = e.getAttributeValue( "href" );
                            if( ref == null )
                            {
                                if( isUndefinedPageLink( e ) )
                                {
                                    m_out.print( "[" );
                                    print( e );
                                    m_out.print( "]" );
                                }
                                else
                                {
                                    print( e );
                                }
                            }
                            else
                            {
                                ref = trimLink( ref );
                                if( ref != null )
                                {
                                    if( ref.startsWith( "#" ) ) // This is a link to a footnote.
                                    {
                                        // convert "#ref-PageName-1" to just "1"
                                        String href = ref.replaceFirst( "#ref-.+-(\\d+)", "$1" );
                                       
                                        // remove the brackets around "[1]"
                                        String textValue = e.getValue().substring( 1, (e.getValue().length() - 1) );
                                       
                                        if( href.equals( textValue ) ){ // handles the simplest case. Example: [1]
                                            print( e );
                                        }                                       
                                        else{ // handles the case where the link text is different from the href. Example: [something|1]
                                            m_out.print( "[" + textValue + "|" + href + "]" );
                                        }
                                    }
                                    else
                                    {
                                        Map augmentedWikiLinkAttributes = getAugmentedWikiLinkAttributes( e );

                                        m_out.print( "[" );
                                        print( e );
                                        if( !e.getTextTrim().equalsIgnoreCase( ref ) )
                                        {
                                            m_out.print( "|" );
                                            print( ref );

                                            if( !augmentedWikiLinkAttributes.isEmpty() )
                                            {
                                                m_out.print( "|" );

                                                String augmentedWikiLink = augmentedWikiLinkMapToString( augmentedWikiLinkAttributes );
                                                m_out.print( augmentedWikiLink );
                                            }
                                        }
                                        else if( !augmentedWikiLinkAttributes.isEmpty() )
                                        {
                                            // If the ref has the same value as the text and also if there
                                            // are attributes, then just print: [ref|ref|attributes] .
                                            m_out.print( "|" + ref + "|" );
                                            String augmentedWikiLink = augmentedWikiLinkMapToString( augmentedWikiLinkAttributes );
                                            m_out.print( augmentedWikiLink );
                                        }

                                        m_out.print( "]" );
                                    }
                                }
                            }
                        }
                    }
                }
                else if( n.equals( "b" ) || n.equals("strong") )
                {
                    m_out.print( "__" );
                    print( e );
                    m_out.print( "__" );
                }
                else if( n.equals( "i" ) || n.equals("em") || n.equals( "address" ) )
                {
                    m_out.print( "''" );
                    print( e );
                    m_out.print( "''" );
                }
                else if( n.equals( "u" ) )
                {
                    m_out.print( "%%( text-decoration:underline; )" );
                    print( e );
                    m_out.print( "%%" );
                }
                else if( n.equals( "strike" ) )
                {
                    m_out.print( "%%strike " );
                    print( e );
                    m_out.print( "%%" );
                    // NOTE: don't print a space before or after the double percents because that can break words into two.
                    // For example: %%(color:red)ABC%%%%(color:green)DEF%% is different from %%(color:red)ABC%% %%(color:green)DEF%%
                }
                else if( n.equals( "sup" ) )
                {
                    m_out.print( "%%sup " );
                    print( e );
                    m_out.print( "%%" );
                }
                else if( n.equals( "sub" ) )
                {
                    m_out.print( "%%sub " );
                    print( e );
                    m_out.print( "%%" );
                }
                else if( n.equals("dl") )
                {
                    m_out.print( "\n" );
                    print( e );

                    // print a newline after the definition list. If we don't,
                    // it may cause problems for the subsequent element.
                    m_out.print( "\n" );
                }
                else if( n.equals("dt") )
                {
                    m_out.print( ";" );
                    print( e );
                }
                else if( n.equals("dd") )
                {
                    m_out.print( ":" );
                    print( e );
                }
                else if( n.equals( "ul" ) )
                {
                    m_out.println();
                    m_liStack.push( "*" );
                    print( e );
                    m_liStack.pop();
                }
                else if( n.equals( "ol" ) )
                {
                    m_out.println();
                    m_liStack.push( "#" );
                    print( e );
                    m_liStack.pop();
                }
                else if( n.equals( "li" ) )
                {
                    m_out.print( m_liStack + " " );
                    print( e );

                    // The following line assumes that the XHTML has been "pretty-printed"
                    // (newlines separate child elements from their parents).
                    boolean lastListItem = base.indexOf( e ) == ( base.getContentSize() - 2 );
                    boolean sublistItem = m_liStack.toString().length() > 1;

                    // only print a newline if this <li> element is not the last item within a sublist.
                    if( !sublistItem || !lastListItem )
                    {
                        m_out.println();
                    }
                }
                else if( n.equals( "pre" ) )
                {
                    m_out.print( "\n{{{" ); // start JSPWiki "code blocks" on its own line
                    m_preStack.push();
                    print( e );
                    m_preStack.pop();

                    // print a newline after the closing braces
                    // to avoid breaking any subsequent wiki markup that follows.
                    m_out.print( "}}}\n" );
                }
                else if( n.equals( "code" ) || n.equals( "tt" ) )
                {
                    m_out.print( "{{" );
                    m_preStack.push();
                    print( e );
                    m_preStack.pop();
                    m_out.print( "}}" );
                    // NOTE: don't print a newline after the closing brackets because if the Text is inside
                    // a table or list, it would break it if there was a subsequent row or list item.
                }
                else if( n.equals( "img" ) )
                {
                    if( !isIgnorableWikiMarkupLink( e ) )
                    {
                        m_out.print( "[" );
                        print( trimLink( e.getAttributeValue( "src" ) ) );
                        m_out.print( "]" );
                    }
                }
                else if( n.equals( "form" ) )
                {
                    // remove the hidden input where name="formname" since a new one will be generated again when the xhtml is rendered.
                    Element formName = (Element)XPath.selectSingleNode( e, "INPUT[@name='formname']" );
                    if( formName != null )
                    {
                        formName.detach();
                    }

                    String name = e.getAttributeValue( "name" );

                    m_out.print( "\n[{FormOpen" );
View Full Code Here

TOP

Related Classes of org.wltea.analyzer.lucene.IKQueryParser$ExpressionParser$Element

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.