Examples of net.yacy.document.Document

net.yacy.document.Document

        final StringBuilder sb = new StringBuilder();
        for (final String[] row: table) {
            sb.append(concatRow(row)).append(' ');
        }
        try {
            return new Document[]{new Document(
                    location,
                    mimeType,
                    charset,
                    this,
                    null,

View Full Code Here

            
            /*
             * create the plasmaParserDocument for the database
             * and set shortText and bodyText properly
             */
            final Document[] docs = new Document[]{new Document(
                    location,
                    mimeType,
                    "UTF-8",
                    this,
                    null,

View Full Code Here

            if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
            
            // create the parser document
            Document[] docs = null;
            final byte[] contentBytes = UTF8.getBytes(writer.toString());
            docs = new Document[]{new Document(
                    location,
                    mimeType,
                    "UTF-8",
                    this,
                    languages,

View Full Code Here

            final RTFEditorKit theRtfEditorKit = new RTFEditorKit();               
            theRtfEditorKit.read(source, doc, 0);            
            
            final String bodyText = doc.getText(0, doc.getLength());
            
            return new Document[]{new Document(
                    location,
                    mimeType,
                    "UTF-8",
                    this,
                    null,

View Full Code Here

     * all extracted information about the parsed document
     */
    public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source)
            throws Parser.Failure, InterruptedException {


      Document theDoc = null;
      
        try {
            String contents = "";
            SummaryInformation summary = null;
            try {
                final VisioTextExtractor extractor = new VisioTextExtractor(source);
              contents = extractor.getText();
                summary = extractor.getSummaryInformation();
            } catch (Exception e) {
              Log.logWarning("vsdParser", e.getMessage());
            }


            String author = null;
            String[] keywords = null;
            String title = null;
            if (summary != null) {
                author = summary.getAuthor();
                if (summary.getKeywords() != null) {
                    keywords = summary.getKeywords().split("[ ,;]");
                }
                title = summary.getTitle();
            }


            String abstrct = null;
            abstrct = ((contents.length() > 80)? contents.substring(0, 80) : contents.trim()).
                          replaceAll("\r\n"," ").
                          replaceAll("\n"," ").
                          replaceAll("\r"," ").
                          replaceAll("\t"," ");
            
            if (title == null) {
                title = abstrct;
            }


           // As the result of parsing this function must return a plasmaParserDocument object
            return new Document[]{new Document(
                    location,     // url of the source document
                    mimeType,     // the documents mime type
                    "UTF-8",      // charset of the document text
                    this,
                    null,         // language

View Full Code Here

    
    return phrases;
  }


  public static String autoTag(final String url, final LoaderDispatcher loader, final int max, final TreeMap<String, YMarkTag> tags) {
    final Document document = loadDocument(url, loader);
    if (document != null)
      return autoTag(document, max, tags);
    else
      return "/IOExceptions";
  }

View Full Code Here

                anchors.put(new MultiProtocolURI(url), p);
                contents = contents.substring(0,urlStart)+contents.substring(urlEnd);
            }


           // As the result of parsing this function must return a plasmaParserDocument object
            return new Document[]{new Document(
                    location,     // url of the source document
                    mimeType,     // the documents mime type
                    "UTF-8",      // charset of the document text
                    this,
                    null,

View Full Code Here

            if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
            
            // create the parser document
            Document[] docs = null;
            final byte[] contentBytes = UTF8.getBytes(writer.toString());
            docs = new Document[]{new Document(
                    location,
                    mimeType,
                    "UTF-8",
                    this,
                    languages,

View Full Code Here

        // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
        // the great number of these objects can easily be seen in Java Visual VM
        // we try to force these objects out of memory here with explicit clear calls and hope that the garbage actually gets released.
        COSName.clearResources();
        PDFont.clearResources();
        return new Document[]{new Document(
                location,
                mimeType,
                "UTF-8",
                this,
                null,

View Full Code Here

                System.out.println(pdfFile.getAbsolutePath());
                final long startTime = System.currentTimeMillis();


                // parse
                final AbstractParser parser = new pdfParser();
                Document document = null;
                try {
                    document = Document.mergeDocuments(null, "application/pdf", parser.parse(null, "application/pdf", null, new FileInputStream(pdfFile)));
                } catch (final Parser.Failure e) {
                    System.err.println("Cannot parse file " + pdfFile.getAbsolutePath());
                    Log.logException(e);
                } catch (final InterruptedException e) {
                    System.err.println("Interrupted while parsing!");
                    Log.logException(e);
                } catch (final NoClassDefFoundError e) {
                    System.err.println("class not found: " + e.getMessage());
                } catch (final FileNotFoundException e) {
                    Log.logException(e);
                }


                // statistics
                System.out.println("\ttime elapsed: " + (System.currentTimeMillis() - startTime) + " ms");


                // output
                if (document == null) {
                    System.out.println("\t!!!Parsing without result!!!");
                } else {
                    System.out.println("\tParsed text with " + document.getTextLength() + " chars of text and " + document.getAnchors().size() + " anchors");
                    try {
                        // write file
                        FileUtils.copy(document.getText(), new File("parsedPdf.txt"));
                    } catch (final IOException e) {
                        System.err.println("error saving parsed document");
                        Log.logException(e);
                    }
                }

View Full Code Here

0 1 2 3 4 5 6

TOP

Related Classes of net.yacy.document.Document

Bookmarks

de.anomic.data.ymark.YMarkAutoTagger

de.anomic.data.ymark.YMarkTables

de.anomic.search.DocumentIndex

de.anomic.search.MediaSnippet

de.anomic.search.Segment

de.anomic.search.Switchboard

de.anomic.search.TextSnippet

get_metadata

get_treeview

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.