Package net.sf.jpluck.spider

Source Code of net.sf.jpluck.spider.Resource

package net.sf.jpluck.spider;

import net.sf.jpluck.palm.bitmap.Bitmap;
import net.sf.jpluck.plucker.CompositeImageRecord;
import net.sf.jpluck.plucker.ImageRecord;
import net.sf.jpluck.plucker.TextRecord;
import net.sf.jpluck.plucker.parsing.html.TidyParser;

import org.xml.sax.SAXException;
import org.xml.sax.InputSource;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;


public class Resource {
    private ContentType contentType;
    private String redirectionURI;
    private String uri;
    private byte[] data;
    private boolean embedded;
    private int level;

    public Resource(String uri, String redirectionURI, ContentType contentType, byte[] data, int level,
                    boolean embedded) {
        this.uri = uri;
        this.redirectionURI = redirectionURI;
        this.contentType = contentType;
        this.data = data;
        this.level = level;
        this.embedded = embedded;
    }

    public String getCharset() {
        if ((contentType != null) && (contentType.getCharsets().length > 0)) {
            return contentType.getCharsets()[0];
        } else {
            return null;
        }
    }

    public ContentType getContentType() {
        return contentType;
    }

    public byte[] getData() {
        return data;
    }

    public boolean isEmbedded() {
        return embedded;
    }

    public int getLevel() {
        return level;
    }

    public String getMimeType() {
        return contentType.getMimeType();
    }

    public boolean isRedirected() {
        return (redirectionURI != null);
    }

    public String getRedirectionURI() {
        return redirectionURI;
    }

    public String getURI() {
        return uri;
    }

    public CompositeImageRecord createCompositeImageRecord(String[] segmentURIs, int rows, boolean alternate) {
        String a = (alternate ? ".alternate" : "");
        String s = (isRedirected() ? redirectionURI : uri) + a;
        CompositeImageRecord imageRecord = new CompositeImageRecord(s, segmentURIs, rows);
        if (isRedirected()) {
            imageRecord.setAlternateURI(uri + a);
        }
        return imageRecord;
    }

    public ImageRecord createImageRecord(Bitmap bitmap, boolean alternate) {
        String a = (alternate ? ".alternate" : "");
        String s = (isRedirected() ? redirectionURI : uri) + a;
        ImageRecord imageRecord = new ImageRecord(s, bitmap);
        if (isRedirected()) {
            imageRecord.setAlternateURI(uri + a);
        }
        return imageRecord;
    }

    public TextRecord createTextRecord(String encoding, boolean hires) {
        TextRecord textRecord = new TextRecord((isRedirected() ? redirectionURI : uri), encoding, hires);
        if (isRedirected()) {
            textRecord.setAlternateURI(uri);
        }
        return textRecord;
    }

    public org.w3c.dom.Document parseXML() throws SAXException {
        try {
            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
            dbf.setNamespaceAware(true);
      DocumentBuilder db = dbf.newDocumentBuilder();
      InputSource inputSource = new InputSource(new ByteArrayInputStream(data));
      inputSource.setSystemId((isRedirected() ? redirectionURI : uri));
            return db.parse(inputSource);
        } catch (IOException e) {
            // Should not occur
            throw new RuntimeException(e);
        } catch (ParserConfigurationException e) {
            // Should not occur
            throw new RuntimeException(e);
        }
    }

    public org.w3c.dom.Document parseHTML() {
        byte[] data = this.data;
        String charset = getCharset();
        if (charset == null) {
            charset = "Windows-1252";
        }

        ByteArrayOutputStream baos = new ByteArrayOutputStream();

        int start = 0;
        for (int i = 0; i < (data.length - 2); i++) {
            int c = (int) data[i] & 0xff;
            int d = (int) data[i + 1] & 0xff;
            if ((c == '&') && (d == '#')) {
                //baos.write(data, start, i - start);
                // Numeric entity
                String s = "";
                int pos = i + 2;
                while (pos < (i + 7)) {
                    int e = (int) data[pos] & 0xff;
                    if (e == ';') {
                        pos += 1;
                    }
                    if ((e >= '0') && (e <= '9')) {
                        s += (char) e;
                    } else {
                        break;
                    }
                    pos++;
                }
                if (s.length() > 0) {
                    int value = Integer.parseInt(s);
                    if (value < 255) {
                        baos.write(data, start, i - start);
                        baos.write(value);
                        start = pos;
                        i = pos - 1; // Possible fix for numeric entity bug
                    }
                }
            }
        }
        baos.write(data, start, data.length - start);
        data = baos.toByteArray();
        if (!charset.equalsIgnoreCase("UTF-8")) {
            ByteBuffer input = ByteBuffer.wrap(data);
            Charset cs = Charset.forName(charset);
            CharBuffer cb = cs.decode(input);
            Charset utf8 = Charset.forName("UTF-8");
            ByteBuffer output = utf8.encode(cb);
            data = output.array();
        }
        return TidyParser.parse(new ByteArrayInputStream(data));
    }

    void scanForContentType() {
        try {
            if (!contentType.getMimeType().equals("text/html")) {
                return;
            }

            String s = new String(data, "ISO-8859-1").toLowerCase();
            Pattern pattern = Pattern.compile("<meta .*http-equiv=\"content-type.*\".*>");
            Matcher matcher = pattern.matcher(s);
            while (matcher.find()) {
                String meta = matcher.group();
                int start = meta.indexOf("content=\"");
                if (start != -1) {
                    start += 9;

                    int end = meta.indexOf('\"', start);
                    String content = meta.substring(start, end);
                    ContentType contentType = new ContentType(content);
                    if (contentType.getCharsets().length > 0) {
                        this.contentType = contentType;
                    }
                    break;
                }
            }
        } catch (UnsupportedEncodingException e) {
        }
    }
}
TOP

Related Classes of net.sf.jpluck.spider.Resource

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.