Package org.archive.util.iterator

Examples of org.archive.util.iterator.RegexLineIterator


     * another thread to proceed after a configurable number of lines processed
     */
    protected void announceSeedsFromReader(BufferedReader reader, CountDownLatch latchOrNull) {
        String s;
        Iterator<String> iter =
            new RegexLineIterator(
                    new LineReadingIterator(reader),
                    RegexLineIterator.COMMENT_LINE,
                    RegexLineIterator.NONWHITESPACE_ENTRY_TRAILING_COMMENT,
                    RegexLineIterator.ENTRY);

        int count = 0;
        while (iter.hasNext()) {
            s = (String) iter.next();
            if(Character.isLetterOrDigit(s.charAt(0))) {
                // consider a likely URI
                seedLine(s);
                count++;
                if(count%20000==0) {
View Full Code Here


        boolean asSeeds = !params.isNull("asSeeds");
        boolean scopeScheduleds = !params.isNull("scopeScheduleds");
        DecideRule scope = scopeScheduleds ? getScope() : null;
        try {
            br = new BufferedReader(new InputStreamReader(new FileInputStream(path)));
            Iterator<String> iter = new RegexLineIterator(new LineReadingIterator(br),
                RegexLineIterator.COMMENT_LINE, extractor, output);
            while(iter.hasNext()) {
                try {
                   
                    CrawlURI curi = CrawlURI.fromHopsViaString(((String)iter.next()));
                    curi.setForceFetch(forceRevisit);
                    if (asSeeds) {
                        curi.setSeed(asSeeds);
                        if (curi.getVia() == null || curi.getVia().length() <= 0) {
                            // Danger of double-add of seeds because of this code here.
View Full Code Here

            URLConnection conn = (new URL(uri)).openConnection();
            reader = new InputStreamReader(conn.getInputStream());
        }
        reader = new BufferedReader(reader);
        Iterator<String> iter =
            new RegexLineIterator(
                    new LineReadingIterator((BufferedReader) reader),
                    RegexLineIterator.COMMENT_LINE,
                    RegexLineIterator.TRIMMED_ENTRY_TRAILING_COMMENT,
                    RegexLineIterator.ENTRY);
        while (iter.hasNext()) {
            String[] entry = ((String) iter.next()).split("\\s+");
            map.put(entry[0],entry[1]);
        }
        reader.close();
    }
View Full Code Here

TOP

Related Classes of org.archive.util.iterator.RegexLineIterator

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.