Package com.digitalpebble.behemoth

Examples of com.digitalpebble.behemoth.BehemothDocument


    public void map(LongWritable key, WritableWarcRecord record,
            OutputCollector<Text, BehemothDocument> output, Reporter reporter)
            throws IOException {

        BehemothDocument behemothDocument = new BehemothDocument();

        if (record.getRecord().getHeaderRecordType().equals("response") == false)
            return;

        byte[] binarycontent = record.getRecord().getContent();

        String uri = record.getRecord()
                .getHeaderMetadataItem("WARC-Target-URI");
        // application/http;msgtype=response
        // but always null?
        // String WARCContentType =
        // record.getRecord().getHeaderMetadataItem("Content-Type");

        HttpResponse response;
        try {
            response = new HttpResponse(binarycontent);
        } catch (ProtocolException e) {
            return;
        }

        behemothDocument.setUrl(uri);
        newKey.set(uri);

        String contentType = response.getHeader(HttpHeaders.CONTENT_TYPE);
        behemothDocument.setContentType(contentType);
        behemothDocument.setContent(response.getContent());

        output.collect(newKey, behemothDocument);
    }
View Full Code Here


    public void map(Text key, Content content,
            OutputCollector<Text, BehemothDocument> output, Reporter reporter)
            throws IOException {

        BehemothDocument behemothDocument = new BehemothDocument();

        int status = Integer.parseInt(content.getMetadata().get(
                Nutch.FETCH_STATUS_KEY));
        if (status != CrawlDatum.STATUS_FETCH_SUCCESS) {
            // content not fetched successfully, skip document
            LOG.debug("Skipping " + key
                    + " as content is not fetched successfully");
            return;
        }

        // TODO store the fetch metadata in the Behemoth document
        // store the binary content and mimetype in the Behemoth document

        String contentType = content.getContentType();
        byte[] binarycontent = content.getContent();
        behemothDocument.setUrl(key.toString());
        behemothDocument.setContent(binarycontent);
        behemothDocument.setContentType(contentType);
        output.collect(key, behemothDocument);
    }
View Full Code Here

        Reader[] cacheReaders = SequenceFileOutputFormat.getReaders(getConf(),
                input);
        for (Reader current : cacheReaders) {
            // read the key + values in that file
            Text key = new Text();
            BehemothDocument inputDoc = new BehemothDocument();
            BufferedWriter writer = null;
            gate.Document gatedocument = null;
            while (current.next(key, inputDoc)) {
                count[0]++;
                // generate a GATE document then save it to XML
View Full Code Here

        Reader[] cacheReaders = SequenceFileOutputFormat.getReaders(getConf(),
                input);
        for (Reader current : cacheReaders) {
            // read the key + values in that file
            Text key = new Text();
            BehemothDocument inputDoc = new BehemothDocument();
            while (current.next(key, inputDoc)) {
                count[0]++;
                // filter the doc?
                if (!docFilter.keep(inputDoc))
                    continue;
                if (dumpBinary && inputDoc.getContent() == null)
                    continue;
                else if (!dumpBinary && inputDoc.getText() == null)
                    continue;

                String fileName = Integer.toString(count[0]);
                String urldoc = inputDoc.getUrl();
                if (mode.equals(FileNamingMode.URL) && urldoc != null
                        && urldoc.length() > 0) {
                    fileName = URLEncoder.encode(urldoc, "UTF-8");
                } else if (mode.equals(FileNamingMode.UUID) && urldoc != null
                        && urldoc.length() > 0) {
                    fileName = UUID.nameUUIDFromBytes(urldoc.getBytes())
                            .toString();
                } else {
                    fileName = String.format("%09d", count[0]);
                }

                if (!dumpBinary)
                    fileName += ".txt";

                byte[] contentBytes;
                if (dumpBinary)
                    contentBytes = inputDoc.getContent();
                else
                    contentBytes = inputDoc.getText().getBytes("UTF-8");
                // out.write(contentBytes, 0, contentBytes.length);
                addToArchive(fileName, contentBytes, dir);

                // add the mapping URL->filename in the index -> archive num
                index.writeBytes(urldoc + "\t" + fileName + "\t"
View Full Code Here

    public long generate(boolean recurse) throws IOException {
        long result = 0;
        // read from input path
        // create new Content object and add it to the SequenceFile
        Text key = new Text();
        BehemothDocument value = new BehemothDocument();
        SequenceFile.Writer writer = null;
        try {
            Configuration conf = getConf();
            FileSystem fs = output.getFileSystem(conf);
            writer = SequenceFile.createWriter(fs, conf, output,
                    key.getClass(), value.getClass());
            PerformanceFileFilter pff = new PerformanceFileFilter(writer, key,
                    value, conf, reporter);
            // iterate on the files in the source dir
            result = processFiles(conf, input, recurse, pff);
View Full Code Here

            if (!path.getName().startsWith("part-")
                    && !path.getName().equals(inputPath.getName()))
                continue;
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
            Text key = new Text();
            BehemothDocument value = new BehemothDocument();
            while (reader.next(key, value)) {
                // skip this document?
                if (doFilter && filters.keep(value) == false)
                    continue;

                System.out.println(value.toString(showBinaryContent,
                        showAnnotations, showText, showMD));
            }
            reader.close();
        }
View Full Code Here

        Reader[] cacheReaders = SequenceFileOutputFormat.getReaders(getConf(),
                input);
        for (Reader current : cacheReaders) {
            // read the key + values in that file
            Text key = new Text();
            BehemothDocument inputDoc = new BehemothDocument();
            BufferedWriter writer = null;
            gate.Document gatedocument = null;
            while (current.next(key, inputDoc)) {
                count[0]++;
                // generate a GATE document then save it to XML
View Full Code Here

        Reader[] cacheReaders = SequenceFileOutputFormat.getReaders(getConf(),
                input);
        for (Reader current : cacheReaders) {
            // read the key + values in that file
            Text key = new Text();
            BehemothDocument inputDoc = new BehemothDocument();
            while (current.next(key, inputDoc)) {
                count[0]++;
                // filter the doc?
                if (!docFilter.keep(inputDoc))
                    continue;
                if (dumpBinary && inputDoc.getContent() == null)
                    continue;
                else if (!dumpBinary && inputDoc.getText() == null)
                    continue;

                String fileName = Integer.toString(count[0]);
                String urldoc = inputDoc.getUrl();
                if (mode.equals(FileNamingMode.URL) && urldoc != null
                        && urldoc.length() > 0) {
                    fileName = URLEncoder.encode(urldoc, "UTF-8");
                } else if (mode.equals(FileNamingMode.UUID) && urldoc != null
                        && urldoc.length() > 0) {
                    fileName = UUID.nameUUIDFromBytes(urldoc.getBytes())
                            .toString();
                } else {
                    fileName = String.format("%09d", count[0]);
                }

                if (!dumpBinary)
                    fileName += ".txt";

                byte[] contentBytes;
                if (dumpBinary)
                    contentBytes = inputDoc.getContent();
                else
                    contentBytes = inputDoc.getText().getBytes("UTF-8");
                // out.write(contentBytes, 0, contentBytes.length);
                addToArchive(fileName, contentBytes, dir);

                // add the mapping URL->filename in the index -> archive num
                index.writeBytes(urldoc + "\t" + fileName + "\t"
View Full Code Here

            if (!path.getName().startsWith("part-")
                    && !path.getName().equals(inputPath.getName()))
                continue;
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
            Text key = new Text();
            BehemothDocument value = new BehemothDocument();
            while (reader.next(key, value)) {
                // skip this document?
                if (doFilter && filters.keep(value) == false)
                    continue;

                System.out.println(value.toString(showBinaryContent,
                        showAnnotations, showText, showMD));
            }
            reader.close();
        }
View Full Code Here

public class SequenceFileConverterMapper extends
        Mapper<Writable, Writable, Text, BehemothDocument> {
    @Override
    protected void map(Writable key, Writable value, Context context)
            throws IOException, InterruptedException {
        BehemothDocument doc = new BehemothDocument();
        doc.setUrl(key.toString());
        // TODO: Is this the right way to do this? We need the bytes.
        DataOutputBuffer out = new DataOutputBuffer();
        value.write(out);
        doc.setContent(out.getData());
        // doc.setContent(value.toString().getBytes(Charset.forName("UTF-8")));
        context.write(new Text(key.toString()), doc);
    }
View Full Code Here

TOP

Related Classes of com.digitalpebble.behemoth.BehemothDocument

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.