/*******************************************************************************
* Copyright 2006 - 2012 Vienna University of Technology,
* Department of Software Technology and Interactive Systems, IFS
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
******************************************************************************/
package eu.scape_project.planning.xml;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.Namespace;
import org.dom4j.QName;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ErrorHandler;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import eu.scape_project.planning.model.FormatInfo;
import eu.scape_project.planning.model.SampleObject;
import eu.scape_project.planning.utils.ParserException;
/**
* A simple parser for the c3po profile.
*
* @author Petar Petrov - <me@petarpetrov.org>
*
*/
public class C3POProfileParser {
/**
* The default namespace of a c3po profile.
*/
private static final String C3PO_NAMESPACE = "http://ifs.tuwien.ac.at/dp/c3po";
/**
* A template for the description of the partition.
*/
private static final String TYPE_OF_OBJECTS_BEGIN = "The collection consists of {1}% '{2}' files. ";
/**
* A template for the second most prominent type of objects in the
* partition.
*/
private static final String TYPE_OF_OBJECTS_SECOND = "It also contains {1}% '{2}' files. ";
/**
* A template for the conflicting objects in the partition.
*/
private static final String TYPE_OF_OBJECTS_CONFLICTS = "{1}% files have conflicts. ";
/**
* A template for the unknown objects in the partition.
*/
private static final String TYPE_OF_OBJECTS_UNKNOWN = "{1}% files have an unknown format. ";
/**
* A constant if the format distribution is missing.
*/
private static final String MISSING = "No format distribution provided";
/**
* A template for the 'description of objects' field.
*/
private static final String DESCRIPTION_OF_SAMPLES_TEMPLATE = "The sample objects were chosen by c3po using the {1} algorithm.";
private static final Logger log = LoggerFactory.getLogger(C3POProfileParser.class);
private Document profile;
private Namespace namespace;
/**
* Reads the profile out of the input stream and validates if needed. If the
* document is faulty for some reason and exception will be thrown.
*
* @param stream
* the stream of the c3po xml profile to read.
* @param validate
* whether or not to validate the xml.
* @throws ParserException
* if some error occurrs.
*/
public void read(final InputStream stream, boolean validate) throws ParserException {
ValidatingParserFactory vpf = new ValidatingParserFactory();
SAXParser parser = null;
try {
parser = vpf.getValidatingParser();
} catch (ParserConfigurationException e) {
log.error("An error occurred while parsing the c3po profile: {}", e.getMessage());
} catch (SAXException e) {
log.error("An error occurred while parsing the c3po profile: {}", e.getMessage());
}
if (validate && !this.isValid(parser, stream)) {
throw new ParserException("Validation was turned on, but the xml file is not valid against the schema.");
}
try {
final SAXReader reader = new SAXReader();
this.profile = reader.read(stream);
final Namespace namespace = this.profile.getRootElement().getNamespace();
if (!namespace.getStringValue().equals(C3PO_NAMESPACE)) {
throw new ParserException("Cannot parse the profile, namespace does not match");
}
} catch (final DocumentException e) {
log.error("An error occurred while reading the profile: {}", e.getMessage());
this.profile = null;
}
try {
stream.close();
} catch (final IOException e) {
log.error("An error occurred while closing the input stream: {}", e.getMessage());
}
}
/**
* Gets the collection identifier.
*
* @return the id.
*/
public String getCollectionId() {
return this.profile.getRootElement().attributeValue("collection");
}
/**
* Gets the partition filter key (used by c3po). Note that only the first
* partition is used.
*
* @return the partition filter key.
*/
public String getPartitionFilterKey() {
return this.profile.getRootElement().element("partition").element("filter").attributeValue("id");
}
/**
* Gets the objects count in the partition.
*
* @return the count of objects.
*/
public String getObjectsCountInPartition() {
return this.profile.getRootElement().element("partition").attributeValue("count");
}
/**
* Gets a human readable description of the objects in the form of
* {@link C3POProfileParser#DESCRIPTION_OF_SAMPLES_TEMPLATE}
*
* @return
*/
public String getDescriptionOfObjects() {
Element samples = (Element) this.profile.getRootElement().element("partition").element("samples");
String type = samples.attributeValue("type");
return DESCRIPTION_OF_SAMPLES_TEMPLATE.replace("{1}", type);
}
/**
* Gets a human readable text description of the most prominent formats in
* the profile. Traverses the properties of the profile until it finds the
* format distribution. Then it takes the first two most occurring formats
* (if existing). It also appends the percentage of conflicted and unknown
* formats if any.
*
* @return the human readable description.
*/
public String getTypeOfObjects() {
int count = Integer.parseInt(this.getObjectsCountInPartition());
QName name = new QName("format", this.namespace);
List<Element> properties = this.profile.getRootElement().element("partition").element("properties")
.elements("property");
List<Element> items = new ArrayList<Element>();
for (Element e : properties) {
if (e.attributeValue("id").equals("format")) {
items.addAll(e.elements());
break;
}
}
if (items.isEmpty()) {
return MISSING;
}
StringBuffer response = new StringBuffer();
String type;
double tmp;
double percent;
if (items.size() >= 1) {
Element item = items.remove(0);
type = item.attributeValue("id");
tmp = Double.parseDouble(item.attributeValue("value"));
percent = Math.floor((tmp / count) * 100);
response.append(TYPE_OF_OBJECTS_BEGIN.replace("{1}", percent + "").replace("{2}", type));
}
if (items.size() >= 1) {// already removed first
Element item = items.remove(0);
type = item.attributeValue("id");
tmp = Double.parseDouble(item.attributeValue("value"));
percent = Math.floor((tmp / count) * 100);
response.append(TYPE_OF_OBJECTS_SECOND.replace("{1}", percent + "").replace("{2}", type));
}
for (Object o : items) {
Element e = (Element) o;
if (e.attributeValue("id").equals("Conflicted")) {
tmp = Double.parseDouble(e.attributeValue("value"));
percent = Math.floor((tmp / count) * 100);
response.append(TYPE_OF_OBJECTS_CONFLICTS.replace("{1}", percent + ""));
} else if (e.attributeValue("id").equals("Unknown")) {
tmp = Double.parseDouble(e.attributeValue("value"));
percent = Math.floor((tmp / count) * 100);
response.append(TYPE_OF_OBJECTS_UNKNOWN.replace("{1}", percent + ""));
}
}
return response.toString();
}
/**
* Gets a list of the sample objects and their metadata.
*
* @return the list of {@link SampleObject}s
*/
public List<SampleObject> getSampleObjects() {
List<SampleObject> objects = new ArrayList<SampleObject>();
Element samples = this.profile.getRootElement().element("partition").element("samples");
for (Object s : samples.elements()) {
Element sample = (Element) s;
objects.add(this.parseSample(sample));
}
return objects;
}
/**
* Gets a list of proprietary identifiers as specified in the profile.
* Depending on the use of the profile and whether or not the profile is
* associated with a repository these identifiers can be different. The only
* guarantee provided by the profile is that all uids are unique (within a
* profile).
*
* @return a list of repository/file specific unique identifiers of the
* objects in the collection.
*/
public List<String> getObjectIdentifiers() {
List<String> uris = new ArrayList<String>();
List<Element> elements = this.profile.getRootElement().element("partition").element("elements")
.elements("element");
for (Element e : elements) {
uris.add(e.attributeValue("uid"));
}
return uris;
}
private SampleObject parseSample(Element sample) {
String uid = sample.attributeValue("uid");
SampleObject sampleObject = new SampleObject(uid);
sampleObject.setFullname(uid);
List<Element> mimes = new ArrayList<Element>();
List<Element> size = new ArrayList<Element>();
List<Element> records = sample.elements("record");
for (Element rec : records) {
if (rec.attributeValue("name").equals("mimetype")) {
mimes.add(rec);
}
if (rec.attributeValue("name").equals("size")) {
size.add(rec);
}
}
if (mimes.size() > 1) {
sampleObject.setContentType("Conflict");
} else if (mimes.size() == 1) {
Element mimetype = (Element) mimes.get(0);
sampleObject.setContentType(mimetype.attributeValue("value"));
}
if (size.size() == 1) {
Element s = (Element) size.get(0);
sampleObject.setSizeInBytes(Long.parseLong(s.attributeValue("value")));
}
FormatInfo info = this.getFormatInfo(sample, sampleObject.getContentType());
sampleObject.setFormatInfo(info);
return sampleObject;
}
private FormatInfo getFormatInfo(Element sample, String mime) {
FormatInfo info = new FormatInfo();
info.setMimeType(mime);
String uid = sample.attributeValue("uid");
List<Element> records = sample.elements("record");
List<Element> formats = new ArrayList<Element>();
List<Element> versions = new ArrayList<Element>();
List<Element> puids = new ArrayList<Element>();
for (Element rec : records) {
if (rec.attributeValue("name").equals("format")) {
formats.add(rec);
}
if (rec.attributeValue("name").equals("format_version")) {
versions.add(rec);
}
if (rec.attributeValue("name").equals("puid")) {
puids.add(rec);
}
}
if (formats.size() > 1) {
info.setName("Conflict");
} else if (formats.size() == 1) {
Element format = (Element) formats.get(0);
info.setName(format.attributeValue("value"));
}
if (versions.size() > 1) {
info.setVersion("Conflict");
} else if (versions.size() == 1) {
Element version = (Element) versions.get(0);
info.setVersion(version.attributeValue("value"));
}
if (puids.size() > 1) {
info.setPuid("Conflict");
} else if (puids.size() == 1) {
Element puid = (Element) puids.get(0);
info.setPuid(puid.attributeValue("value"));
}
return info;
}
// TODO read in the schema and if it is not of c3po - validate agains
// the current schema of c3po.
private boolean isValid(SAXParser parser, InputStream stream) {
log.debug("validating collection profile");
try {
SimpleErrorHandler errorHandler = new SimpleErrorHandler();
SAXReader reader = new SAXReader(parser.getXMLReader());
reader.setValidation(true);
reader.setErrorHandler(errorHandler);
reader.read(stream);
return errorHandler.isValid();
} catch (SAXException e) {
log.error("SAXException: {}", e.getMessage());
} catch (DocumentException e) {
e.printStackTrace();
log.error("DocumentException: {}", e.getMessage());
} catch (NullPointerException e) {
log.warn("Factory is not initialized. Did you call init()");
}
return false;
}
/**
* A simple error handler to catch if the xml document has some errors.
*
* @author Petar Petrov - <me@petarpetrov.org>
*
*/
private class SimpleErrorHandler implements ErrorHandler {
private boolean valid;
public SimpleErrorHandler() {
this.valid = true;
}
@Override
public void error(SAXParseException e) throws SAXException {
log.error("Error: {}", e.getMessage());
this.valid = false;
}
@Override
public void fatalError(SAXParseException e) throws SAXException {
log.error("Fatal Error: {}", e.getMessage());
this.valid = false;
}
@Override
public void warning(SAXParseException e) throws SAXException {
log.error("Warning: {}", e.getMessage());
}
public boolean isValid() {
return this.valid;
}
}
}