package net.sf.jpluck.spider;
import net.sf.jpluck.palm.bitmap.Bitmap;
import net.sf.jpluck.plucker.CompositeImageRecord;
import net.sf.jpluck.plucker.ImageRecord;
import net.sf.jpluck.plucker.TextRecord;
import net.sf.jpluck.plucker.parsing.html.TidyParser;
import org.xml.sax.SAXException;
import org.xml.sax.InputSource;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
/**
 * Holds the raw bytes of a resource together with its URI, an optional
 * redirection URI, its content type, a level and an "embedded" flag, and
 * offers helpers to turn that data into Plucker records or DOM documents.
 *
 * <p>When a redirection URI is present it is used as the primary name of the
 * records and documents created from this resource; the originally requested
 * URI is then registered as the alternate URI.</p>
 */
public class Resource {
    /** Charset assumed for HTML when the content type does not name one. */
    private static final String DEFAULT_HTML_CHARSET = "Windows-1252";

    /** Suffix appended to record URIs for the "alternate" variant of an image. */
    private static final String ALTERNATE_SUFFIX = ".alternate";

    /** Maximum number of decimal digits examined after "&#". */
    private static final int MAX_ENTITY_DIGITS = 5;

    /** Attribute prefix searched for inside a matched meta tag. */
    private static final String CONTENT_ATTRIBUTE = "content=\"";

    /**
     * Matches a single (lower-cased) meta tag carrying
     * http-equiv="content-type...". "[^>]" keeps the match inside one tag, even
     * when several tags share a line or a tag spans several lines.
     */
    private static final Pattern META_CONTENT_TYPE =
        Pattern.compile("<meta [^>]*http-equiv=\"content-type[^>]*\"[^>]*>");

    private ContentType contentType;
    private final String redirectionURI;
    private final String uri;
    private final byte[] data;
    private final boolean embedded;
    private final int level;

    /**
     * Creates a resource.
     *
     * @param uri            the URI of the resource
     * @param redirectionURI the URI the resource was redirected to, or
     *                       <code>null</code> if there was no redirection
     * @param contentType    the content type; may be <code>null</code>
     * @param data           the raw content; the array is stored, not copied
     * @param level          the level associated with this resource
     * @param embedded       the embedded flag of this resource
     */
    public Resource(String uri, String redirectionURI, ContentType contentType, byte[] data, int level,
                    boolean embedded) {
        this.uri = uri;
        this.redirectionURI = redirectionURI;
        this.contentType = contentType;
        this.data = data;
        this.level = level;
        this.embedded = embedded;
    }

    /**
     * Returns the first charset listed by the content type.
     *
     * @return the charset name, or <code>null</code> if there is no content
     *         type or it lists no charset
     */
    public String getCharset() {
        if (contentType == null) {
            return null;
        }
        String[] charsets = contentType.getCharsets();
        return (charsets.length > 0) ? charsets[0] : null;
    }

    /**
     * @return the content type; may have been replaced by
     *         {@link #scanForContentType()}
     */
    public ContentType getContentType() {
        return contentType;
    }

    /**
     * @return the internal content array (not a copy)
     */
    public byte[] getData() {
        return data;
    }

    /**
     * @return the embedded flag passed to the constructor
     */
    public boolean isEmbedded() {
        return embedded;
    }

    /**
     * @return the level passed to the constructor
     */
    public int getLevel() {
        return level;
    }

    /**
     * @return the MIME type reported by the content type
     * @throws NullPointerException if this resource has no content type
     */
    public String getMimeType() {
        return contentType.getMimeType();
    }

    /**
     * @return <code>true</code> if a redirection URI was supplied
     */
    public boolean isRedirected() {
        return (redirectionURI != null);
    }

    /**
     * @return the redirection URI, or <code>null</code> if not redirected
     */
    public String getRedirectionURI() {
        return redirectionURI;
    }

    /**
     * @return the URI passed to the constructor
     */
    public String getURI() {
        return uri;
    }

    /**
     * Creates a composite image record named after this resource.
     *
     * @param segmentURIs passed through to the record
     * @param rows        passed through to the record
     * @param alternate   if <code>true</code>, ".alternate" is appended to the
     *                    record URI (and to its alternate URI)
     * @return the new record; when redirected, its alternate URI is the
     *         original URI (plus suffix)
     */
    public CompositeImageRecord createCompositeImageRecord(String[] segmentURIs, int rows, boolean alternate) {
        String suffix = (alternate ? ALTERNATE_SUFFIX : "");
        CompositeImageRecord imageRecord =
            new CompositeImageRecord(getEffectiveURI() + suffix, segmentURIs, rows);
        if (isRedirected()) {
            imageRecord.setAlternateURI(uri + suffix);
        }
        return imageRecord;
    }

    /**
     * Creates an image record named after this resource.
     *
     * @param bitmap    passed through to the record
     * @param alternate if <code>true</code>, ".alternate" is appended to the
     *                  record URI (and to its alternate URI)
     * @return the new record; when redirected, its alternate URI is the
     *         original URI (plus suffix)
     */
    public ImageRecord createImageRecord(Bitmap bitmap, boolean alternate) {
        String suffix = (alternate ? ALTERNATE_SUFFIX : "");
        ImageRecord imageRecord = new ImageRecord(getEffectiveURI() + suffix, bitmap);
        if (isRedirected()) {
            imageRecord.setAlternateURI(uri + suffix);
        }
        return imageRecord;
    }

    /**
     * Creates a text record named after this resource.
     *
     * @param encoding passed through to the record
     * @param hires    passed through to the record
     * @return the new record; when redirected, its alternate URI is the
     *         original URI
     */
    public TextRecord createTextRecord(String encoding, boolean hires) {
        TextRecord textRecord = new TextRecord(getEffectiveURI(), encoding, hires);
        if (isRedirected()) {
            textRecord.setAlternateURI(uri);
        }
        return textRecord;
    }

    /**
     * Parses the content as XML with a namespace-aware parser. The system ID
     * of the input is the redirection URI if present, otherwise the URI.
     *
     * @return the parsed document
     * @throws SAXException     if the content cannot be parsed
     * @throws RuntimeException wrapping an {@link IOException} or
     *                          {@link ParserConfigurationException}
     */
    public org.w3c.dom.Document parseXML() throws SAXException {
        try {
            // NOTE(review): the factory is used with its default entity handling;
            // DTDs / external entities are not disabled here. If the content is
            // untrusted, consider hardening against XXE -- confirm that no caller
            // relies on DTD resolution first.
            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
            dbf.setNamespaceAware(true);
            DocumentBuilder db = dbf.newDocumentBuilder();
            InputSource inputSource = new InputSource(new ByteArrayInputStream(data));
            inputSource.setSystemId(getEffectiveURI());
            return db.parse(inputSource);
        } catch (IOException e) {
            // Should not occur
            throw new RuntimeException(e);
        } catch (ParserConfigurationException e) {
            // Should not occur
            throw new RuntimeException(e);
        }
    }

    /**
     * Parses the content as HTML.
     *
     * <p>Short decimal character references with a value below 255 are first
     * replaced by the corresponding single byte. The bytes are then transcoded
     * from the resource's charset (Windows-1252 if none is known) to UTF-8,
     * unless the charset already is UTF-8, and handed to {@link TidyParser}.</p>
     *
     * @return the document produced by {@link TidyParser#parse}
     * @throws IllegalArgumentException if the charset name is illegal or not
     *                                  supported (thrown by
     *                                  {@link Charset#forName(String)})
     */
    public org.w3c.dom.Document parseHTML() {
        String charset = getCharset();
        if (charset == null) {
            charset = DEFAULT_HTML_CHARSET;
        }
        byte[] bytes = replaceNumericEntities(this.data);
        if (!charset.equalsIgnoreCase("UTF-8")) {
            bytes = transcodeToUtf8(bytes, charset);
        }
        return TidyParser.parse(new ByteArrayInputStream(bytes));
    }

    /**
     * Scans HTML content for a
     * <code>&lt;meta http-equiv="content-type" content="..."&gt;</code> tag and,
     * if the declared content type names at least one charset, replaces this
     * resource's content type with it. Does nothing unless the current MIME
     * type is "text/html". Only the first matching tag that has a well-formed
     * <code>content="..."</code> attribute is considered.
     */
    void scanForContentType() {
        if ((contentType == null) || !"text/html".equals(contentType.getMimeType())) {
            return;
        }
        String html;
        try {
            // ISO-8859-1 maps every byte to one char, so markup can be searched
            // regardless of the real encoding. A fixed locale keeps lower-casing
            // of ASCII letters independent of the platform default.
            html = new String(data, "ISO-8859-1").toLowerCase(java.util.Locale.ENGLISH);
        } catch (UnsupportedEncodingException e) {
            // Should not occur
            throw new RuntimeException(e);
        }
        Matcher matcher = META_CONTENT_TYPE.matcher(html);
        while (matcher.find()) {
            String meta = matcher.group();
            int start = meta.indexOf(CONTENT_ATTRIBUTE);
            if (start == -1) {
                continue;
            }
            start += CONTENT_ATTRIBUTE.length();
            int end = meta.indexOf('"', start);
            if (end == -1) {
                // Unterminated attribute value: ignore this tag.
                continue;
            }
            ContentType declared = new ContentType(meta.substring(start, end));
            if (declared.getCharsets().length > 0) {
                this.contentType = declared;
            }
            break;
        }
    }

    /** Returns the redirection URI if this resource was redirected, otherwise the URI. */
    private String getEffectiveURI() {
        return (isRedirected() ? redirectionURI : uri);
    }

    /**
     * Returns a copy of <code>source</code> in which every "&#" followed by up to
     * {@value #MAX_ENTITY_DIGITS} decimal digits (and an optional ';') whose
     * value is below 255 is replaced by the single byte with that value.
     */
    private static byte[] replaceNumericEntities(byte[] source) {
        // NOTE(review): the substitution writes a raw byte, which is only
        // meaningful for single-byte charsets, and it also turns references to
        // markup characters (e.g. "&#60;") into literal markup -- confirm that
        // this is intended.
        ByteArrayOutputStream out = new ByteArrayOutputStream(source.length);
        int copiedUpTo = 0;
        for (int i = 0; i < (source.length - 2); i++) {
            if ((source[i] != '&') || (source[i + 1] != '#')) {
                continue;
            }
            int firstDigit = i + 2;
            // Never read past the end of the array, even for a truncated entity.
            int limit = Math.min(firstDigit + MAX_ENTITY_DIGITS, source.length);
            int pos = firstDigit;
            int value = 0;
            while ((pos < limit) && (source[pos] >= '0') && (source[pos] <= '9')) {
                value = (value * 10) + (source[pos] - '0');
                pos++;
            }
            if (pos == firstDigit) {
                // "&#" not followed by a digit: leave untouched.
                continue;
            }
            if ((pos < source.length) && (source[pos] == ';')) {
                // Swallow the terminator so it does not remain in the output.
                pos++;
            }
            if (value < 255) {
                out.write(source, copiedUpTo, i - copiedUpTo);
                out.write(value);
                copiedUpTo = pos;
                i = pos - 1; // resume scanning right after the entity
            }
        }
        out.write(source, copiedUpTo, source.length - copiedUpTo);
        return out.toByteArray();
    }

    /** Decodes <code>source</code> using <code>charset</code> and re-encodes it as UTF-8. */
    private static byte[] transcodeToUtf8(byte[] source, String charset) {
        CharBuffer chars = Charset.forName(charset).decode(ByteBuffer.wrap(source));
        ByteBuffer encoded = Charset.forName("UTF-8").encode(chars);
        // The backing array may be larger than the encoded content, so copy
        // only the bytes between position and limit.
        byte[] result = new byte[encoded.remaining()];
        encoded.get(result);
        return result;
    }
}