Package org.jab.docsearch.utils

Examples of org.jab.docsearch.utils.WebPageMetaData


                }

                // use the correct data extractor
                switch (fileType) {
                    case HTML: {
                        WebPageMetaData wpmd = getWebPageMetaData(currentFi);

                        curTitle = wpmd.getTitle();
                        curSummary = wpmd.getDescription();
                        author = wpmd.getAuthor();
                        is = new FileInputStream(ds.htmlTextFile);
                        break;
                    }
                    case TEXT: {
                        curTitle = getTextTitle(currentFi);
View Full Code Here


     *
     * @param filename  filename of webpage
     * @return author, title, and summary of a web page specified in filename
     */
    public WebPageMetaData getWebPageMetaData(String filename) {
        WebPageMetaData tempWpmd = new WebPageMetaData();
        tempWpmd.setFilename(filename);

        BufferedReader reader = null;
        PrintWriter writer = null;
        try {
            boolean inTag = false;
            boolean foundSummary = false;
            boolean inBody = false;
            boolean inScript = false;
            boolean inTitle = false;

            StringBuffer tagBuf = new StringBuffer();
            StringBuffer titleBuf = new StringBuffer();
            StringBuffer summaryBuf = new StringBuffer();

            // open reader and writer
            File origFile = new File(filename);
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(origFile)));
            writer = new PrintWriter(new FileWriter(ds.htmlTextFile));

            StringBuffer nonTagTextf = new StringBuffer();
            int curBodyNonTagTextNum = 0;
            int sumMaxSize = 220;
            int ch;

            // step during html source
            while ((ch = reader.read()) > -1) {
                char curChar = (char) ch;

                if (curChar == '>') {
                    inTag = false;
                    //
                    tagBuf.append(curChar);
                    String realTag = tagBuf.toString();
                    String lowerTag = realTag.toLowerCase();
                    if (lowerTag.startsWith(META_TAG)) {
                        String tempMetaName = Utils.getTagString("name=", lowerTag);
                        if (tempMetaName.startsWith("description")) {
                            String tempMetaContent = Utils.getTagString("content=", realTag);
                            if (! tempMetaContent.trim().equals("")) {
                                tempWpmd.setDescription(tempMetaContent);
                                foundSummary = true;
                            }
                        }
                        else if (tempMetaName.startsWith("summary")) {
                            String tempMetaContent = Utils.getTagString("content=", realTag);
                            if (! tempMetaContent.trim().equals("")) {
                                tempWpmd.setDescription(tempMetaContent);
                                foundSummary = true;
                            }
                        }
                        else if (tempMetaName.startsWith("author") || tempMetaName.indexOf("webmaster") != -1) {
                            String tempMetaContent = Utils.getTagString("content=", realTag);
                            tempWpmd.setAuthor(tempMetaContent);
                        }
                    }
                    else if (lowerTag.startsWith(SCRIPT_TAG)) {
                        if (!lowerTag.endsWith("/>")) {
                            inScript = true;
                        }
                    }
                    else if (lowerTag.startsWith(SCRIPT_TAG_END)) {
                        inScript = false;
                    }
                    else if (lowerTag.startsWith(BODY_TAG)) {
                        inBody = true;
                    }
                    else if (lowerTag.startsWith(BODY_TAG_END)) {
                        inBody = false;
                    }
                    else if (lowerTag.startsWith(TITLE_TAG)) {
                        inTitle = true;
                    }
                    else if (lowerTag.startsWith(TITLE_TAG_END)) {
                        inTitle = false;
                        tempWpmd.setTitle(titleBuf.toString());
                    }
                    // reset our buffers
                    tagBuf = new StringBuffer();
                    nonTagTextf = new StringBuffer();
                }
                else if (curChar == '<') {
                    inTag = true;
                    tagBuf = new StringBuffer();
                    String t = nonTagTextf.toString().trim();
                    int tSize = t.length();
                    if (tSize > 0) {
                        if (! inScript && inBody) {
                            writer.println(t);
                        }
                    }
                    nonTagTextf = new StringBuffer();
                    //
                    if (! foundSummary) {
                        //
                        if (inBody) {
                            curBodyNonTagTextNum += tSize;
                            summaryBuf.append(' ');
                            summaryBuf.append(t);
                            summaryBuf.append(' ');
                            if (curBodyNonTagTextNum >= sumMaxSize) {
                                tempWpmd.setDescription(Utils.concatStrToEnd(summaryBuf.toString(), sumMaxSize));
                                foundSummary = true;
                            }
                        }
                    }
                    //
                } // end for the beginning of a tag
                if (inTitle && curChar != '>' && ! inTag) {
                    titleBuf.append(curChar);
                }
                else if (! inTag && curChar != '>') {
                    nonTagTextf.append(curChar);
                }
                else if (inTag) {
                    tagBuf.append(curChar);
                }
            } // end for while reading

            if (! foundSummary && curBodyNonTagTextNum > 0) {
                tempWpmd.setDescription(summaryBuf.toString());
            }
        }
        catch (IOException ioe) {
            logger.error("getWebPageMetaData() failed", ioe);
            ds.setStatus(I18n.getString("error") + " : " + ioe.toString());
View Full Code Here

TOP

Related Classes of org.jab.docsearch.utils.WebPageMetaData

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.