// Source Code of org.apache.flex.compiler.internal.parsing.mxml.MXMLTokenizer

/*
 *
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 */


package org.apache.flex.compiler.internal.parsing.mxml;


import java.io.Closeable;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;


import org.apache.commons.io.IOUtils;


import org.apache.flex.compiler.common.MutablePrefixMap;
import org.apache.flex.compiler.common.PrefixMap;
import org.apache.flex.compiler.filespecs.FileSpecification;
import org.apache.flex.compiler.filespecs.IFileSpecification;
import org.apache.flex.compiler.internal.parsing.as.ASTokenTypes;
import org.apache.flex.compiler.parsing.IMXMLToken;
import org.apache.flex.compiler.parsing.IMXMLTokenizer;
import org.apache.flex.compiler.parsing.MXMLTokenTypes;
import org.apache.flex.compiler.problems.ICompilerProblem;
import org.apache.flex.compiler.problems.InternalCompilerProblem2;
import org.apache.flex.utils.NonLockingStringReader;


/**
 * Tokenizes MXML files.  Uses RawTagTokenizer to get basic tokens.  Ignores comments (<!--...-->),
 * processing instructions (<?...?>), and whitespace.  Replaces CDATA tokens with text tokens (strips
 * out the cdata stuff.
 */
public class MXMLTokenizer implements IMXMLTokenizer, Closeable
{
  /**
   * Start offset (for when you're parsing a section of the document that
   * doesn't start at the beginning)
   */
  protected int startOffset;
  
  private int tagDepth = -1;
  
  /**
   * Specifies that we are within a tags content, ie inside &lt; and &gt;
   */
  private boolean inTagContent = false;


  private RawMXMLTokenizer tokenizer;
  
  protected MXMLToken xmlNSToken = null;


  protected MutablePrefixMap rootPrefixMap;


  private MXMLToken postRepairToken = null; 
  
  private boolean isRepairing = true;
  
  private boolean wasRepaired = false;
  
  private static final int SIZE = 100;
  
  private List<ICompilerProblem> problems;


    private String path;
    
    private MXMLToken lastToken = null;
    
    private static final String SUB_SYSTEM = "MXMLTokenizer";


  /**
   * Constructor
   */
  public MXMLTokenizer(String path)
  {
      tokenizer = new RawMXMLTokenizer();
        problems = new ArrayList<ICompilerProblem>();
        rootPrefixMap = new MutablePrefixMap();
        this.path = path;
  }
  
  public MXMLTokenizer() {
      this("");
  }
  
  public MXMLTokenizer(IFileSpecification specification) {
      this(specification.getPath());
  }
  
  /**
   * Reparse constructor.  Allows you to start the tokenizer with a start
   * offset (for when you're parsing a section of the document that doesn't
   * start at the beginning).
   * @param startOffset    Start offset
   */
  public MXMLTokenizer(int startOffset)
  {
    this("");
      this.startOffset = startOffset;
  }
  
  public void setPath(String path) {
      this.path = path;
  }
  
  public void setReader(Reader reader) {
      tokenizer.reset();
        tokenizer.yyreset(reader);
  }
  
    @Override
    public void close() throws IOException
    {
        if (tokenizer != null)
        {
            tokenizer.reset();
            tokenizer.yyclose(); //close the reader
        }
    }
    
    /**
   * If it exists, return the PrefixMap from the last parse
   * @return a {@link PrefixMap} or null
   */
  public PrefixMap getPrefixMap() {
    return rootPrefixMap;
  }
  
  /**
   * Sets a flag to indicate whether this tokenizer should try to repair its token stream
   * @param isRepairing <code>true</code> to repair, <code>false</code> to not repair
   */
  @Override
    public void setIsRepairing(boolean isRepairing) {
    this.isRepairing = isRepairing;
  }
  
  @Override
    public IMXMLToken[] getTokens(Reader reader) {
    List<MXMLToken> parseTokens = parseTokens(reader);
    return parseTokens.toArray( new IMXMLToken[0]);
  }


  @Override
    public IMXMLToken[] getTokens(String range) {
    List<MXMLToken> parseTokens = parseTokens(new NonLockingStringReader(range));
    return parseTokens.toArray( new IMXMLToken[0]);
  }
  
  /**
   * Determines if the the tokenizer has encountered any problems as it lexed the given input
   * @return true if we have encountered any problems
   */
  public boolean hasTokenizationProblems() {
    return tokenizer.hasProblems() || problems.size() > 0;
  }
  
  /**
   * Processes the given input and builds a {@link PrefixMap} for the root tag found within this document
   */
  public PrefixMap getRootTagPrefixMap() {
    boolean cont = true;
    do {
      MXMLToken token = nextToken();
      if(token == null || token.isTagEnd()) {
        cont = false;
      }
    } while(cont);
    return rootPrefixMap;
  }


  /**
   * Returns a collection of problems encountered while processing the given input
   * @return a {@link Collection} of {@link ICompilerProblem} objects, or an empty {@link Collection}
   */
  public List<ICompilerProblem> getTokenizationProblems() {
    ArrayList<ICompilerProblem> problems = new ArrayList<ICompilerProblem>(this.problems);
    problems.addAll(tokenizer.getProblems());
    return problems;
  }
  
  /**
     * Returns the next token that can be produced from the given input, without performing any repair code
     * @return an {@link MXMLToken} or null when no more tokens can be produced
     */
  private final MXMLToken nextTokenInternal() {
      MXMLToken retVal = null;
        boolean cont = true;
        while(cont) {
            try
            {
                MXMLToken token = tokenizer.hasBufferToken() ? (MXMLToken)tokenizer.getBufferToken() : (MXMLToken)tokenizer.nextToken();
                if(token == null)
                    return null;
                MXMLToken mxmlToken = processToken(token);
                if(mxmlToken != null) {
                    retVal = mxmlToken;
                    return retVal;
                }
            }
            catch (Exception e)
            {
                ICompilerProblem problem = new InternalCompilerProblem2(path, e, SUB_SYSTEM); 
                problems.add(problem);
                return null;
            }
            catch (Error e)
            {
                ICompilerProblem problem = new InternalCompilerProblem2(path, e, SUB_SYSTEM); 
                problems.add(problem);
                return null;
            }
        }
        return null;
  }


  
  /**
   * Returns the next token that can be produced from the given input
   * @return an {@link MXMLToken} or null when no more tokens can be produced
   */
  public MXMLToken nextToken() {
      if(isRepairing) {
          if(postRepairToken != null) {
                MXMLToken retVal = postRepairToken;
                postRepairToken = null;
                return retVal;
            }
          MXMLToken mxmlToken = nextTokenInternal();
            MXMLToken addedToken = analyzeForEndTagProblems(mxmlToken);
            if(addedToken != null) {
                postRepairToken = mxmlToken;
                wasRepaired = true;   
                return addedToken;
            }
            return mxmlToken;
      }
      return nextTokenInternal();
  }


  /**
   * Parse the contents of input
   * @param input    Reader containing file to be parsed
   * @return      List of MXMLTokens
   */
  public List<MXMLToken> parseTokens(Reader input) {
    // Add fake characters onto the end of the stream to make it easier to handle
    // unclosed constructs like <![CDATA[ and <!--.
    wasRepaired = false;
    setReader(input);
    // Set the start offset in the tokenizer
    // This is done after setReader() as setReader() resets the tokenizer, setting yychar to 0
    tokenizer.setOffset(startOffset);
    MXMLToken token = null;
    List<MXMLToken> list = new ArrayList<MXMLToken>(SIZE);
    try {
      do {
          token = nextToken();
          if(token != null)
              buildTokenList((MXMLToken)token.clone(), list);
          
      }while(token != null);
      lastToken = null;
      return list;
    } finally {
      try {
        tokenizer.yyclose();
      } catch (IOException e) {
          ICompilerProblem problem = new InternalCompilerProblem2(path, e, SUB_SYSTEM);
          problems.add(problem);
      }
    }
  }
  
  // TODO: remove this. It now does nothing. See note below
  private MXMLToken analyzeForEndTagProblems(MXMLToken currentToken) {
        if(currentToken == null)
            return null;
        try {
            
            if(currentToken.isTagStart() && lastToken != null) {
                switch(lastToken.getType()) {
                    case MXMLTokenTypes.TOKEN_WHITESPACE:
                    case MXMLTokenTypes.TOKEN_PROCESSING_INSTRUCTION:
                    case MXMLTokenTypes.TOKEN_COMMENT:
                    case MXMLTokenTypes.TOKEN_ASDOC_COMMENT:
                    case MXMLTokenTypes.TOKEN_STRING:
                    case MXMLTokenTypes.TOKEN_TEXT:
                    case MXMLTokenTypes.TOKEN_CDATA:
                    case MXMLTokenTypes.TOKEN_TAG_END:
                    case MXMLTokenTypes.TOKEN_EMPTY_TAG_END:
                    case -1:
                        return null; //all legal to come before open tag start
                    default:
                        // turn off this logic that makes up a fake token. The MXMLData already
                        // known how do to this. And if we do it here, we lose the information that the repair
                        // was done. Since we actually care, this causes bugs.
                        return null;
                      
                }
            } 
            return null;
        }
        finally
        {
            lastToken = currentToken;
        }
    }
  
  /**
   * Determines if any tokens were added as a side effect of repair.  This can only be called after a tokenize call
   * @return true if the token stream was modified
   */
  public boolean tokensWereRepaired() {
    return wasRepaired;
  }
  
  /**
   * Processes tokens, performs various transforms on the tokens that we return, such as:
   * <ul>
   * <li>transform XMLNS style tokens to name tokens for easier consumption by clients</li>
   * <li>filter out state combiner tokens</li>
   * <li>track xmlns string values</li>
   * </ul>
   * Note that we don't modify/merge whitespace and text tokens here as there are a number
   * of tests which are sensitive to whitespace, ie MetaMXMLSuite.
   * @param token
   * @return an {@link MXMLToken} or null if it was not accepted
   */
  private MXMLToken processToken(final MXMLToken token) {
      //TODO find xmlns uri values in the lexer instead of here
      switch (token.getType())
        {
            // tags (and also DTD directives)
            case MXMLTokenTypes.TOKEN_OPEN_TAG_START:
              tagDepth++;
              inTagContent = true;
              return token;
            case MXMLTokenTypes.TOKEN_CLOSE_TAG_START:
              tagDepth--;
              inTagContent = true;
              return token;
            case MXMLTokenTypes.TOKEN_TAG_END:
            case MXMLTokenTypes.TOKEN_EMPTY_TAG_END:
                inTagContent = false;
                return token;
            // stuff inside tags
            case MXMLTokenTypes.TOKEN_EQUALS:
            //outside tags
            case MXMLTokenTypes.TOKEN_CDATA:
                return token;
            case MXMLTokenTypes.TOKEN_NAME:
                xmlNSToken = null;
                return token;
            case MXMLTokenTypes.TOKEN_XMLNS:
                token.setType(MXMLTokenTypes.TOKEN_NAME);
                xmlNSToken = token;
                return token;
            case MXMLTokenTypes.TOKEN_STRING:
                //if the current namespace we are tracking is not null, then this string should yield the namespace URI
              //only track the namespace of the root document
              if(xmlNSToken != null && tagDepth == 0) {
                    String prefix = "";
                    String text = xmlNSToken.getText();
                    if(text.length() > 5) { //has prefix
                        prefix = text.substring(6);
                    }
                    String nsText = token.getText();
                    String ns = nsText.length() > 1 ? 
                            nsText.substring(1, nsText.length() -1) : "";
                    rootPrefixMap.add(prefix, ns);
                }
                return token;
            // stuff outside tags
            default:
            {
                if(tagDepth != 0 && !tokenizer.isInE4XDatabinding() && !inTagContent) {
                    //probably mixed content.  Allow it and let it fail downstream if we're wrong
                    if(token.isLiteral() || token.getType() == ASTokenTypes.TOKEN_IDENTIFIER) {
                        token.setType(MXMLTokenTypes.TOKEN_TEXT);
                    }
                }
                return token;
            }
        }
  }


  /**
   * Handles the addition of tokens to the internal token list.  Subclasses should override this method to handle
   * different tokenizing strategies
   * @param token The current token.
   * @param list The list of tokens being built.
   */
  protected void buildTokenList(MXMLToken token, List<MXMLToken> list)
  {
    if(token != null) {
        list.add(token);
    }
  }
  
  public static void main(String[] args)
  {
        final FileSpecification fileSpec = new FileSpecification(args[0]);
        
        final MXMLTokenizer tokenizer = new MXMLTokenizer(fileSpec.getPath());
        try
        {
            List<MXMLToken> tokens = tokenizer.parseTokens(fileSpec.createReader());
            for (MXMLToken token : tokens)
            {
                System.out.println(token.toDumpString());
            }
        }
        catch (FileNotFoundException e)
        {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        finally
        {
            IOUtils.closeQuietly(tokenizer);
        }
  }
}
// Source Code of org.apache.flex.compiler.internal.parsing.mxml.MXMLTokenizer

// Related Classes of org.apache.flex.compiler.internal.parsing.mxml.MXMLTokenizer