Package com.digitalpebble.behemoth

Examples of com.digitalpebble.behemoth.DocumentFilter


    }

    private void generateDocs(Path input, Path dir, int[] count)
            throws IOException, ArchiveException {

        DocumentFilter docFilter = DocumentFilter.getFilters(getConf());

        Reader[] cacheReaders = SequenceFileOutputFormat.getReaders(getConf(),
                input);
        for (Reader current : cacheReaders) {
            // read the key + values in that file
            Text key = new Text();
            BehemothDocument inputDoc = new BehemothDocument();
            while (current.next(key, inputDoc)) {
                count[0]++;
                // filter the doc?
                if (!docFilter.keep(inputDoc))
                    continue;
                if (dumpBinary && inputDoc.getContent() == null)
                    continue;
                else if (!dumpBinary && inputDoc.getText() == null)
                    continue;
View Full Code Here


        Configuration conf = getConf();
        FileSystem fs = FileSystem.get(conf);

        // filter input
        DocumentFilter filters = DocumentFilter.getFilters(conf);
        boolean doFilter = DocumentFilter.isRequired(conf);

        FileStatus[] fss = fs.listStatus(inputPath);
        for (FileStatus status : fss) {
            Path path = status.getPath();
            // skips the _log or _SUCCESS files
            if (!path.getName().startsWith("part-")
                    && !path.getName().equals(inputPath.getName()))
                continue;
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
            Text key = new Text();
            BehemothDocument value = new BehemothDocument();
            while (reader.next(key, value)) {
                // skip this document?
                if (doFilter && filters.keep(value) == false)
                    continue;

                System.out.println(value.toString(showBinaryContent,
                        showAnnotations, showText, showMD));
            }
View Full Code Here

    }

    private void generateDocs(Path input, Path dir, int[] count)
            throws IOException, ArchiveException {

        DocumentFilter docFilter = DocumentFilter.getFilters(getConf());

        Reader[] cacheReaders = SequenceFileOutputFormat.getReaders(getConf(),
                input);
        for (Reader current : cacheReaders) {
            // read the key + values in that file
            Text key = new Text();
            BehemothDocument inputDoc = new BehemothDocument();
            while (current.next(key, inputDoc)) {
                count[0]++;
                // filter the doc?
                if (!docFilter.keep(inputDoc))
                    continue;
                if (dumpBinary && inputDoc.getContent() == null)
                    continue;
                else if (!dumpBinary && inputDoc.getText() == null)
                    continue;
View Full Code Here

        Configuration conf = getConf();
        FileSystem fs = inputPath.getFileSystem(conf);

        // filter input
        DocumentFilter filters = DocumentFilter.getFilters(conf);
        boolean doFilter = DocumentFilter.isRequired(conf);

        FileStatus[] fss = fs.listStatus(inputPath);
        for (FileStatus status : fss) {
            Path path = status.getPath();
            // skips the _log or _SUCCESS files
            if (!path.getName().startsWith("part-")
                    && !path.getName().equals(inputPath.getName()))
                continue;
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
            Text key = new Text();
            BehemothDocument value = new BehemothDocument();
            while (reader.next(key, value)) {
                // skip this document?
                if (doFilter && filters.keep(value) == false)
                    continue;

                System.out.println(value.toString(showBinaryContent,
                        showAnnotations, showText, showMD));
            }
View Full Code Here

TOP

Related Classes of com.digitalpebble.behemoth.DocumentFilter

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by Oracle, Inc. Contact coftware#gmail.com.