Package net.sf.jpluck.handlers

Source Code of net.sf.jpluck.handlers.PlainTextHandler

package net.sf.jpluck.handlers;

import net.sf.jpluck.plucker.Paragraph;
import net.sf.jpluck.plucker.TextRecord;
import net.sf.jpluck.spider.Resource;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;


public class PlainTextHandler extends ContentHandler {
  public PlainTextHandler(net.sf.jpluck.plucker.Document pluckerDocument, net.sf.jpluck.jxl.Document jxlDocument,
              Resource resource) {
    super(pluckerDocument, jxlDocument, resource);
  }

  public void handle() throws HandlingException {
    try {
      TextRecord textRecord = new TextRecord(resource.getURI(), jxlDocument.getOutputEncoding(),
                           jxlDocument.isUseHiresMargins());
      String charset = resource.getCharset();
      if (charset == null) {
        charset = "ISO-8859-1";
      }
      LineNumberReader rdr = new LineNumberReader(new InputStreamReader(new ByteArrayInputStream(resource.getData()),
                                        charset));
      Paragraph paragraph = textRecord.addParagraph(Paragraph.DEFAULT_SPACING);
      for (String line; (line = rdr.readLine()) != null;) {
        if (line.length() == 0) {
          paragraph = textRecord.addParagraph(Paragraph.DEFAULT_SPACING);
        } else {
          int start = line.indexOf("http://");
          if (start > -1) {
            int end = line.indexOf(' ', start);
            if (end == -1) {
              end = line.length();
            }

            String url = line.substring(start, end);
            paragraph.addLinkStart(url);
            paragraph.addPreformattedText(url);
            paragraph.addLinkEnd();
          } else {
            paragraph.addPreformattedText(line);
          }
          paragraph.addNewline();
        }
      }
      pluckerDocument.addRecord(textRecord);
    } catch (IOException e) {
      throw new HandlingException(e);
    }
  }
}
TOP

Related Classes of net.sf.jpluck.handlers.PlainTextHandler

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.