InputStream in = UtilExtract.getStream(info.getUri());
final LinkedList links = new LinkedList();
//define the document structure that will receive the parsed content
final DocumStruct doc = new DocumStruct();
//use a SAX parser instead of a DOM parser for performance reasons
//get a reader to the data using sax
try {
// Create a JAXP "parser factory" for creating SAX parsers
javax.xml.parsers.SAXParserFactory saxFactory = SAXParserFactory.newInstance();
// Configure the parser factory for the type of parsers we require
saxFactory.setValidating(false); // No validation required
// Now use the parser factory to create a SAXParser object
// Note that SAXParser is a JAXP class, not a SAX class
javax.xml.parsers.SAXParser saxParser = saxFactory.newSAXParser();
// Create a SAX input source for the file argument
org.xml.sax.InputSource input = new InputSource(in);
//create the stack
final Stack nodeStack = new Stack();
//final NodeStruct actual = new NodeStruct();
// final DefaultMutableTreeNode tree = new DefaultMutableTreeNode();
//define the controller that builds the node tree on the stack from the parser callbacks
IXMLController controller = new IXMLController() {
// (non-Javadoc)
// @see net.fp.rp.back.extractor.xml.IXMLController#handleContent(java.lang.String)
//
// Attaches the text content of the element called 'name' to the node on top
// of the node stack, then removes that node from the stack.
//
// name    - element name the content belongs to
// content - text content of the element; null or empty content is ignored
//
// The node popped here is the one pushed by handleElemAttributes, which
// always adds a placeholder tuple (name, ""). When that placeholder is the
// node's only tuple, its value is filled in instead of adding a second tuple
// with the same keyword. When the node has other tuples as well (attributes),
// the content is appended as an additional tuple.
//
// If the stack is empty after the pop, the popped node is stored as the
// content of the document.
//
// SAXException is declared for the IXMLController contract; this body does
// not throw it directly.
// NOTE(review): presumably invoked by SaxXMLBuilder once per closed element,
// so the stack is never empty on entry - confirm against SaxXMLBuilder.
public void handleContent(final String name,
        final String content) throws SAXException {
    logger.debug("XMLProcessing - handle the content " +
        content + " for the name " + name);

    final NodeStruct actual = (NodeStruct) nodeStack.pop();

    // Placeholder tuple (name, "") to overwrite, or null when the content
    // has to be appended as a new tuple.
    TupleStruct placeholder = null;

    //if the node has only one tuple (special case)
    if (actual.getTuples().size() == 1) {
        final TupleStruct only = (TupleStruct) actual.getTuples().get(0);

        //single tuple for this element that still has an empty value?
        if (only.getKeyword().equals(name) &&
                "".equals(only.getValue())) {
            logger.debug("XMLProcessing - handle the node " +
                name + " which had an empty value before");
            placeholder = only;
        }
    }

    //ignore missing or empty contents
    if ((content != null) && (content.length() > 0)) {
        if (placeholder != null) {
            //update the value of the placeholder
            placeholder.setValue(content);
        } else {
            actual.addTuple(name, content);
        }

        logger.debug("Tag name/value is " + name + "/" +
            content);

        //collect the content as a link when the tag is one of the link tags
        if (linkRequired && (linkTags.indexOf(name) != -1)) {
            logger.debug("Tag name is a link " + name + "/" +
                content);
            links.add(content);
        }
    }

    //nothing left on the stack: the popped node becomes the document content
    if (nodeStack.isEmpty()) {
        logger.debug(
            "XMLProcessing - set the content actual node as content for document");
        doc.setContent(actual);
    }
}
// (non-Javadoc)
// @see net.fp.rp.back.extractor.xml.IXMLController#handleElemAttributes(java.lang.String, org.xml.sax.AttributeList)
//
// Creates the node for the element called 'name' and makes it the current
// node by pushing it onto the node stack.
//
// name       - name of the element
// attributes - attributes of the element; each one is stored on the new node
//              as a tuple (attribute name, encoded attribute value)
//
// After the attribute tuples, a tuple (name, "") is added for the element
// itself. If the stack already holds a node, the new node is registered as a
// child of the node on top of the stack before being pushed.
//
// SAXException is declared for the IXMLController contract; this body does
// not throw it directly.
public void handleElemAttributes(String name,
        AttributeList attributes) throws SAXException {
    logger.debug(
        "XMLProcessing - handle the element attributes for name " +
        name);

    final NodeStruct created = new NodeStruct();

    //store every attribute as a tuple; values are passed through UtilExtract.encode
    final int attributeCount = attributes.getLength();
    for (int idx = 0; idx < attributeCount; idx++) {
        final String attributeName = attributes.getName(idx);
        final String attributeValue = attributes.getValue(idx);
        created.addTuple(attributeName, UtilExtract.encode(attributeValue));
    }

    //tuple for the element itself, with an empty value for now
    created.addTuple(name, "");

    //link the new node to its parent, i.e. the node currently on top of the stack
    if (!nodeStack.isEmpty()) {
        final NodeStruct parent =
            (NodeStruct) nodeStack.get(nodeStack.size() - 1);
        parent.addChild(created);
    }

    nodeStack.push(created);
}
};
SaxXMLBuilder builder = new SaxXMLBuilder(controller);
//parse the input and notify the handler
saxParser.parse(input, builder);
} catch (SAXException e) {
logger.debug("SAXException in processing location" + info.getUri(),
e);
throw new RpException("extractor.xml.filenotvalid",
new Object[] { info.getUri() });
} catch (Throwable t) {
logger.debug("Exception in processing the location" +
info.getUri(), t);
throw new RpException("app.extract.error",
new Object[] { info.getUri() });
} finally {
try {
if (in != null) {
in.close();
}
} catch (IOException e) {
}
}
/*
try
{
DOMParser parser = new DOMParser();
parser.parse( new InputSource( in ) );
Document xmldoc = parser.getDocument();
//parse the document and generate the content nodes
doc.setContent( Translator.translate( xmldoc.getDocumentElement() ) );
}
catch ( SAXException e )
{
e.printStackTrace(System.out);
}
catch ( IOException e )
{
e.printStackTrace(System.out);
}
*/
//add the document to the list
doc.setPath(info.getUri());
doc.setTitle(UtilExtract.getFilenameTitle(info.getUri()));
//get the summary of the document
StringBuffer summary = new StringBuffer("");
boolean isMaxReached = false;
NodeStruct node = doc.getContent();
for (int i = 0; (i < node.getTuples().size()) && (!isMaxReached);
i++) {
TupleStruct tuple = (TupleStruct) node.getTuples().get(i);
//add to the summary
if (summary.length() <= getMaxLengthSummary()) {
summary.append(tuple.getValue());
summary.append(" ");
}
if (summary.length() > getMaxLengthSummary()) {
isMaxReached = true;
}
}
if (isMaxReached) {
doc.setDescription(summary.toString().substring(0,
getMaxLengthSummary()));
} else {
doc.setDescription(summary.toString());
}
doc.setCategoryName(info.getCategoryName());
doc.setCategoryLocation(info.getCategoryLocation());
//store and reindex document
PluginManager.storeAndAddDocument(doc);
logger.debug("Level of the information is " + info.getLevel());