/*
* Copyright (C) Chaperon. All rights reserved.
* -------------------------------------------------------------------------
* This software is published under the terms of the Apache Software License
* version 1.1, a copy of which has been included with this distribution in
* the LICENSE file.
*/
package net.sourceforge.chaperon.process;
import net.sourceforge.chaperon.common.Decoder;
import org.apache.commons.logging.Log;
import org.xml.sax.*;
import org.xml.sax.ext.LexicalHandler;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.LocatorImpl;
/**
 * The processor converts a text stream into lexical tokens, like a tokenizer.
 *
 * <p>It works as a SAX filter: events outside of a <code>text</code> element in the
 * namespace {@link #NS} are passed through to the configured content handler. The
 * character content of such a <code>text</code> element is buffered and, when the
 * element ends, matched against the lexemes of the configured lexical automaton.
 * The result is emitted as an <code>output</code> element in the namespace
 * {@link #NS_OUTPUT} containing <code>lexeme</code> and <code>error</code> elements.</p>
 *
 * @author <a href="mailto:stephan@apache.org">Stephan Michels </a>
 * @version CVS $Id: LexicalProcessor.java,v 1.22 2004/01/04 16:54:34 benedikta Exp $
 */
public class LexicalProcessor implements ContentHandler, LexicalHandler
{
  /** Namespace of the input element which carries the text to tokenize. */
  public static final String NS = "http://chaperon.sourceforge.net/schema/text/1.0";

  /** Local name of the input element which carries the text to tokenize. */
  public static final String TEXT = "text";

  /** Namespace of all elements produced by this processor. */
  public static final String NS_OUTPUT = "http://chaperon.sourceforge.net/schema/lexer/2.0";

  /** Local name of the root element of the produced fragment. */
  public static final String OUTPUT = "output";

  /** Local name of the element emitted for every recognized lexeme with a symbol. */
  public static final String LEXEME = "lexeme";

  /** Local name of the element emitted for every group of a lexeme. */
  public static final String GROUP = "group";

  /** Local name of the element emitted for text which no lexeme matched. */
  public static final String ERROR = "error";

  private ContentHandler contentHandler = null;
  private LexicalHandler lexicalHandler = null;

  // The processor is either outside of a text element (events are passed
  // through) or inside of one (character data is collected).
  private static final int STATE_OUTSIDE = 0;
  private static final int STATE_TEXT = 1;
  private int state = STATE_OUTSIDE;

  // Locator handed in by the event source, and the mutable copy of it which
  // is handed to the content handler.
  private Locator locator = null;
  private LocatorImpl locatorImpl = null;

  private LexicalAutomaton automaton = null;
  private Log log = null;
  private boolean grouping = false;
  private boolean localizable = false;

  // Origin and current position of the text which is tokenized.
  private String source;
  private int lineNumber;
  private int columnNumber;

  // Collected character data of the current text element, and the same
  // data as array while it is tokenized.
  private StringBuffer buffer = null;
  private char[] text = null;

  /**
   * Create a new lexical processor.
   */
  public LexicalProcessor() {}

  /**
   * Create a new lexical processor.
   *
   * @param automaton Lexical automaton, which should be used.
   */
  public LexicalProcessor(LexicalAutomaton automaton)
  {
    this.automaton = automaton;
  }

  /**
   * Set the lexical automaton, which the processor should use.
   *
   * @param automaton Lexical automaton, which should be used.
   */
  public void setLexicalAutomaton(LexicalAutomaton automaton)
  {
    this.automaton = automaton;
  }

  /**
   * Set the <code>ContentHandler</code> that will receive XML data.
   *
   * @param handler Handler, which receives the content events.
   */
  public void setContentHandler(ContentHandler handler)
  {
    this.contentHandler = handler;
  }

  /**
   * Set the <code>LexicalHandler</code> that will receive XML data.
   *
   * @param handler Handler, which receives the lexical events. May be null,
   *                in which case lexical events are dropped.
   */
  public void setLexicalHandler(LexicalHandler handler)
  {
    this.lexicalHandler = handler;
  }

  /**
   * Set the log, which should be used.
   *
   * @param log Log. May be null, in which case nothing is logged.
   */
  public void setLog(Log log)
  {
    this.log = log;
  }

  /**
   * Enable or disable the output of <code>group</code> elements within the
   * <code>lexeme</code> elements.
   *
   * @param grouping True, if the groups of a lexeme should be emitted.
   */
  public void setGrouping(boolean grouping)
  {
    this.grouping = grouping;
  }

  /**
   * Enable or disable the output of location attributes (<code>source</code>,
   * <code>line</code> and <code>column</code>).
   *
   * @param localizable True, if location attributes should be emitted.
   */
  public void setLocalizable(boolean localizable)
  {
    this.localizable = localizable;
  }

  /**
   * Receive an object for locating the origin of SAX document events.
   *
   * <p>A mutable copy of the locator is handed to the content handler, so
   * that the processor can adjust the reported position.</p>
   */
  public void setDocumentLocator(Locator locator)
  {
    this.locator = locator;
    this.locatorImpl = null;

    if (locator!=null)
    {
      this.locatorImpl = new LocatorImpl(locator);
      contentHandler.setDocumentLocator(locatorImpl);
    }
  }

  /**
   * Receive notification of the beginning of a document.
   */
  public void startDocument() throws SAXException
  {
    syncLocatorWithParser();
    contentHandler.startDocument();

    state = STATE_OUTSIDE;
    buffer = new StringBuffer();
  }

  /**
   * Receive notification of the beginning of an element.
   *
   * <p>A <code>text</code> element in the namespace {@link #NS} starts the
   * collecting of character data; its optional attributes <code>source</code>,
   * <code>line</code> and <code>column</code> override the position taken from
   * the document locator. Every other element is passed through.</p>
   *
   * @throws SAXException If an element starts within a text element.
   */
  public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
    throws SAXException
  {
    if (state==STATE_OUTSIDE)
    {
      if ((namespaceURI!=null) && (namespaceURI.equals(NS)) && (localName.equals(TEXT)))
      {
        state = STATE_TEXT;
        buffer = new StringBuffer();

        // Explicit attributes win over the locator, the locator wins over defaults.
        String sourceValue = atts.getValue("source");
        if (sourceValue!=null)
          source = sourceValue;
        else if (locator!=null)
          source = locator.getSystemId();
        else
          source = "unknown";

        String columnValue = atts.getValue("column");
        if (columnValue!=null)
          columnNumber = Integer.parseInt(columnValue);
        else if (locator!=null)
          columnNumber = locator.getColumnNumber();
        else
          columnNumber = 1;

        String lineValue = atts.getValue("line");
        if (lineValue!=null)
          lineNumber = Integer.parseInt(lineValue);
        else if (locator!=null)
          lineNumber = locator.getLineNumber();
        else
          lineNumber = 1;
      }
      else
        contentHandler.startElement(namespaceURI, localName, qName, atts);
    }
    else if (state==STATE_TEXT)
      throw new SAXException("Unexpected start element '"+qName+"'.");
  }

  /**
   * Receive notification of character data.
   *
   * <p>Outside of a text element the data is passed through, inside of a
   * text element it is collected for the tokenizing.</p>
   */
  public void characters(char[] ch, int start, int length)
    throws SAXException
  {
    if (state==STATE_OUTSIDE)
      contentHandler.characters(ch, start, length);
    else if (state==STATE_TEXT)
      buffer.append(ch, start, length);
  }

  /**
   * Receive notification of ignorable whitespace in element content.
   *
   * <p>Outside of a text element the whitespace is passed through as
   * ignorable whitespace, inside of a text element it is collected like
   * character data.</p>
   */
  public void ignorableWhitespace(char[] ch, int start, int length)
    throws SAXException
  {
    if (state==STATE_OUTSIDE)
      contentHandler.ignorableWhitespace(ch, start, length);
    else if (state==STATE_TEXT)
      buffer.append(ch, start, length);
  }

  /**
   * Receive notification of the end of an element.
   *
   * <p>The end of the text element triggers the tokenizing of the collected
   * character data.</p>
   *
   * @throws SAXException If another element than the text element ends
   *                      within a text element.
   */
  public void endElement(String namespaceURI, String localName, String qName)
    throws SAXException
  {
    if (state==STATE_OUTSIDE)
      contentHandler.endElement(namespaceURI, localName, qName);
    else if (state==STATE_TEXT)
    {
      if ((namespaceURI!=null) && (namespaceURI.equals(NS)) && (localName.equals(TEXT)))
      {
        state = STATE_OUTSIDE;
        handleEndDocument();
      }
      else
        throw new SAXException("Unexpected end element '"+qName+"'.");
    }
  }

  /**
   * Begin the scope of a prefix-URI Namespace mapping.
   */
  public void startPrefixMapping(String prefix, String uri)
    throws SAXException
  {
    syncLocatorWithParser();
    contentHandler.startPrefixMapping(prefix, uri);
  }

  /**
   * End the scope of a prefix-URI mapping.
   */
  public void endPrefixMapping(String prefix) throws SAXException
  {
    syncLocatorWithParser();
    contentHandler.endPrefixMapping(prefix);
  }

  /**
   * Receive notification of a processing instruction. Processing
   * instructions within a text element are dropped.
   */
  public void processingInstruction(String target, String data)
    throws SAXException
  {
    syncLocatorWithParser();

    if (state==STATE_OUTSIDE)
      contentHandler.processingInstruction(target, data);
  }

  /**
   * Receive notification of a skipped entity. Skipped entities within a
   * text element are dropped.
   */
  public void skippedEntity(String name) throws SAXException
  {
    syncLocatorWithParser();

    if (state==STATE_OUTSIDE)
      contentHandler.skippedEntity(name);
  }

  /**
   * Receive notification of the end of a document. The event is only passed
   * through, if the processor is not within a text element.
   */
  public void endDocument() throws SAXException
  {
    syncLocatorWithParser();

    if (state==STATE_OUTSIDE)
      contentHandler.endDocument();
  }

  /**
   * Report the start of DTD declarations, if any.
   */
  public void startDTD(String name, String publicId, String systemId)
    throws SAXException
  {
    if (lexicalHandler!=null)
      lexicalHandler.startDTD(name, publicId, systemId);
  }

  /**
   * Report the end of DTD declarations.
   */
  public void endDTD() throws SAXException
  {
    if (lexicalHandler!=null)
      lexicalHandler.endDTD();
  }

  /**
   * Report the beginning of an entity.
   */
  public void startEntity(String name) throws SAXException
  {
    if (lexicalHandler!=null)
      lexicalHandler.startEntity(name);
  }

  /**
   * Report the end of an entity.
   */
  public void endEntity(String name) throws SAXException
  {
    if (lexicalHandler!=null)
      lexicalHandler.endEntity(name);
  }

  /**
   * Report the start of a CDATA section.
   */
  public void startCDATA() throws SAXException
  {
    if (lexicalHandler!=null)
      lexicalHandler.startCDATA();
  }

  /**
   * Report the end of a CDATA section.
   */
  public void endCDATA() throws SAXException
  {
    if (lexicalHandler!=null)
      lexicalHandler.endCDATA();
  }

  /**
   * Report an XML comment anywhere in the document.
   */
  public void comment(char[] ch, int start, int len) throws SAXException
  {
    if (lexicalHandler!=null)
      lexicalHandler.comment(ch, start, len);
  }

  /**
   * Receives the notification, that the text stream ended.
   *
   * <p>The collected text is tokenized from left to right. At every position
   * all lexemes of the automaton are tried, from the last to the first one;
   * the longest match wins, and of equally long matches the lexeme with the
   * lowest index. A non-empty match with a symbol is emitted as
   * <code>lexeme</code> element, a non-empty match without a symbol is
   * ignored. Characters, for which no lexeme matches a non-empty text, are
   * collected and emitted as <code>error</code> element.</p>
   *
   * @throws SAXException If the content handler rejects an event.
   */
  public void handleEndDocument() throws SAXException
  {
    PatternProcessor processor = new PatternProcessor();

    text = buffer.toString().toCharArray();

    int position = 0;

    if (locatorImpl!=null)
    {
      locatorImpl.setSystemId(source);
      locatorImpl.setLineNumber(lineNumber);
      locatorImpl.setColumnNumber(columnNumber);
    }

    contentHandler.startPrefixMapping("", NS_OUTPUT);

    // The source attribute has to be handed over with the output element;
    // the locator may not know a system id, in which case there is no source.
    AttributesImpl atts = new AttributesImpl();
    if ((localizable) && (source!=null))
      atts.addAttribute("", "source", "source", "CDATA", source);
    contentHandler.startElement(NS_OUTPUT, OUTPUT, OUTPUT, atts);

    StringBuffer unrecognized = new StringBuffer();

    while (position<text.length)
    {
      String tokensymbol = null;
      String tokentext = null;
      String[] tokengroups = null;

      for (int lexemeindex = automaton.getLexemeCount()-1; lexemeindex>=0; lexemeindex--)
      {
        processor.setPatternAutomaton(automaton.getLexemeDefinition(lexemeindex));

        if ((processor.match(text, position)) &&
            ((tokentext==null) || (processor.getGroup().length()>=tokentext.length())))
        {
          tokensymbol = automaton.getLexemeSymbol(lexemeindex);
          tokentext = processor.getGroup();

          // The groups must be saved now: the following match attempts
          // overwrite the state of the pattern processor.
          if (grouping)
            tokengroups = captureGroups(processor);
        }
      }

      if ((tokentext!=null) && (tokentext.length()>0))
      {
        // Flush the unrecognized text which precedes this lexeme.
        if (unrecognized.length()>0)
        {
          reportUnrecognized(unrecognized.toString(), position);
          unrecognized = new StringBuffer();
        }

        if (tokensymbol!=null)
        {
          if (log!=null)
            log.debug("Recognize token "+tokensymbol+" with "+Decoder.toString(tokentext));

          // NOTE(review): the locator reports the position of the event source
          // here, not the position within the text - confirm that this is intended.
          syncLocatorWithParser();

          atts = new AttributesImpl();
          atts.addAttribute("", "symbol", "symbol", "CDATA", tokensymbol);
          atts.addAttribute("", "text", "text", "CDATA", tokentext);
          if (localizable)
          {
            atts.addAttribute("", "line", "line", "CDATA", String.valueOf(lineNumber));
            atts.addAttribute("", "column", "column", "CDATA", String.valueOf(columnNumber));
          }

          contentHandler.startElement(NS_OUTPUT, LEXEME, LEXEME, atts);

          if (tokengroups!=null)
            for (int i = 0; i<tokengroups.length; i++)
            {
              AttributesImpl groupatts = new AttributesImpl();
              groupatts.addAttribute("", "text", "text", "CDATA", tokengroups[i]);
              contentHandler.startElement(NS_OUTPUT, GROUP, GROUP, groupatts);
              contentHandler.endElement(NS_OUTPUT, GROUP, GROUP);
            }

          contentHandler.endElement(NS_OUTPUT, LEXEME, LEXEME);
        }
        else if (log!=null)
          log.debug("Ignore lexeme with "+Decoder.toString(tokentext));

        syncLocatorWithText();

        increasePosition(position, tokentext.length());
        position += tokentext.length();
      }
      else
      {
        // An empty match does not consume anything, so the character is
        // treated like unrecognized text to guarantee progress.
        if ((tokentext!=null) && (log!=null))
          log.warn("Lexical processor recognized empty lexeme '"+tokensymbol+"'");

        syncLocatorWithText();

        unrecognized.append(text[position]);
        position++;
      }
    }

    // Flush the unrecognized text at the end of the stream.
    if (unrecognized.length()>0)
      reportUnrecognized(unrecognized.toString(), position);

    syncLocatorWithParser();

    contentHandler.endElement(NS_OUTPUT, OUTPUT, OUTPUT);
    contentHandler.endPrefixMapping("");
  }

  /**
   * Copies the groups 1 to (group count - 1) of the last match of the
   * pattern processor.
   *
   * @param processor Pattern processor, directly after a successful match.
   *
   * @return Texts of the groups, without the group 0.
   */
  private static String[] captureGroups(PatternProcessor processor)
  {
    int count = processor.getGroupCount();
    String[] groups = new String[(count>1) ? (count-1) : 0];

    for (int i = 1; i<count; i++)
      groups[i-1] = processor.getGroup(i);

    return groups;
  }

  /**
   * Emits an error element for text which was not recognized, and moves the
   * line and column counters behind this text.
   *
   * @param unrecognizedText Text, which no lexeme matched.
   * @param position Position within the text directly behind the unrecognized text.
   *
   * @throws SAXException If the content handler rejects an event.
   */
  private void reportUnrecognized(String unrecognizedText, int position)
    throws SAXException
  {
    if (log!=null)
      log.debug("Text was not recognized "+Decoder.toString(unrecognizedText));

    AttributesImpl atts = new AttributesImpl();
    atts.addAttribute("", "text", "text", "CDATA", unrecognizedText);
    if (localizable)
    {
      // The counters still point to the start of the unrecognized text.
      atts.addAttribute("", "line", "line", "CDATA", String.valueOf(lineNumber));
      atts.addAttribute("", "column", "column", "CDATA", String.valueOf(columnNumber));
    }

    contentHandler.startElement(NS_OUTPUT, ERROR, ERROR, atts);
    contentHandler.endElement(NS_OUTPUT, ERROR, ERROR);

    increasePosition(position-unrecognizedText.length(), unrecognizedText.length());
  }

  /**
   * Copies the current position of the locator of the event source into the
   * locator which is seen by the content handler.
   */
  private void syncLocatorWithParser()
  {
    if (locatorImpl!=null)
    {
      locatorImpl.setLineNumber(locator.getLineNumber());
      locatorImpl.setColumnNumber(locator.getColumnNumber());
    }
  }

  /**
   * Copies the current position within the tokenized text into the locator
   * which is seen by the content handler.
   */
  private void syncLocatorWithText()
  {
    if (locatorImpl!=null)
    {
      locatorImpl.setColumnNumber(columnNumber);
      locatorImpl.setLineNumber(lineNumber);
    }
  }

  /**
   * Moves the line and column counters over a part of the text. A line feed,
   * and a carriage return which is not followed by a line feed, start a new
   * line; every other character increases the column.
   *
   * @param position Index of the first character to move over.
   * @param length Count of characters to move over.
   */
  private void increasePosition(int position, int length)
  {
    for (int i = position; i<(position+length); i++)
    {
      if (text[i]=='\n')
      {
        columnNumber = 1;
        lineNumber++;
      }
      else if ((text[i]=='\r') && ((i==(text.length-1)) || (text[i+1]!='\n')))
      {
        columnNumber = 1;
        lineNumber++;
      }
      else
        columnNumber++;
    }
  }
}