Examples of TupleEntryCollector


Examples of cascading.tuple.TupleEntryCollector

        // Fragment of a test body: fills the input tap with FetchedDatum tuples built from
        // records of an ARC archive, then runs a flow from that tap to the output tap
        // through parserPipe.
        BasePath inputPath = platform.makePath("build/test/ParserPipeTest/in");
        Tap in = platform.makeTap(platform.makeBinaryScheme(FetchedDatum.FIELDS), inputPath);
        BasePath outputPath = platform.makePath("build/test/ParserPipeTest/out");
        Tap out = platform.makeTap(platform.makeBinaryScheme(ParsedDatum.FIELDS), outputPath, SinkMode.REPLACE);

        // The *input* tap is opened for writing so the test can generate its own source data.
        TupleEntryCollector write = in.openForWrite(platform.makeFlowProcess());

        ArchiveReader archiveReader = ArchiveReaderFactory.get("src/test/resources/someHtml.arc");
        Iterator<ArchiveRecord> iterator = archiveReader.iterator();
        // Look at no more than 300 archive records.
        int max = 300;
        int count = 0;
        // Counts the http records written below; it is not read in the visible part of this snippet.
        int validRecords = 0;
        while (count++ < max && iterator.hasNext()) {
            ArchiveRecord archiveRecord = iterator.next();
            ArchiveRecordHeader header = archiveRecord.getHeader();
            String url = header.getUrl();

            // An unparsable URL leaves protocol empty, so the record fails the "http" check below.
            String protocol = "";
            try {
                protocol = new URL(url).getProtocol();
            } catch (MalformedURLException e) {
                // Ignore and skip
            }

            if (protocol.equals("http")) {
                validRecords += 1;
                // Content size is the record's total length minus the offset where content begins.
                int contentOffset = header.getContentBegin();
                long totalLength = header.getLength();
                int contentLength = (int) totalLength - contentOffset;

                // NOTE(review): the return values of skip() and read() are ignored; if a single
                // read() can return fewer bytes than requested, content may be partially filled - confirm.
                archiveRecord.skip(contentOffset);
                byte[] content = new byte[contentLength];
                archiveRecord.read(content);

                String mimetype = header.getMimetype();
                // The Arc headers != HTTP headers, but it's at least some data we can jam
                // into the FetchedDatum as a test. Note that the Arc headers will have value
                // types other than a long, so we have to do the conversion.
                HttpHeaders headers = new HttpHeaders();
                Set<String> keys = header.getHeaderFieldKeys();
                for (String key : keys) {
                    String value = header.getHeaderValue(key).toString();
                    headers.add(key, value);
                }
               
                // The same url is passed for the first two FetchedDatum constructor arguments.
                FetchedDatum contentTuple = new FetchedDatum(url, url, System.currentTimeMillis(), headers, new ContentBytes(content), mimetype, 0);
                write.add(contentTuple.getTuple());
            }
        }

        // Close the collector before the flow reads the tap as its source.
        write.close();
        FlowConnector flowConnector = platform.makeFlowConnector();
        Flow flow = flowConnector.connect(in, out, parserPipe);
        flow.complete();
       
        // Currently many of the docs fail parsing:
View Full Code Here

Examples of cascading.tuple.TupleEntryCollector

    public static void importSeedUrls(BasePlatform platform, BasePath crawlDbPath, String fileName) throws Exception  {
       
        SimpleUrlNormalizer normalizer = new SimpleUrlNormalizer();
       
        InputStream is = null;
        TupleEntryCollector writer = null;
        try {
            Tap urlSink = platform.makeTap(platform.makeTextScheme(), crawlDbPath, SinkMode.REPLACE);
            writer = urlSink.openForWrite(platform.makeFlowProcess());

            is = DemoWebMiningWorkflow.class.getResourceAsStream(fileName);
            if (is == null) {
                throw new FileNotFoundException("The seed urls file doesn't exist");
            }

            List<String> lines = IOUtils.readLines(is);
            for (String line : lines) {
                line = line.trim();
                if (line.startsWith("#")) {
                    continue;
                }

                CrawlDbDatum datum = new CrawlDbDatum(normalizer.normalize(line), 0, UrlStatus.UNFETCHED, 0.0f, 0.0f);
                writer.add(datum.getTuple());
            }

        } catch (IOException e) {
            crawlDbPath.delete(true);
            throw e;
        } finally {
            IoUtils.safeClose(is);
            if (writer != null) {
                writer.close();
            }
        }

    }
View Full Code Here

Examples of cascading.tuple.TupleEntryCollector

            crawlDelay = _policy.getDefaultCrawlDelay();
        }
       
        // Fragment: groups the scored URLs of one key into fetch sets and emits one
        // FetchSetDatum tuple per set the policy returns.
        _policy.startFetchSet(key, crawlDelay);
       
        TupleEntryCollector collector = buffCall.getOutputCollector();

        // Built once from the key and _numReduceTasks, then passed to every datum made below.
        PartitioningKey newKey = new PartitioningKey(key, _numReduceTasks);
       
        while (safeHasNext()) {
            ScoredUrlDatum scoredDatum = new ScoredUrlDatum(new TupleEntry(values.next()));
            // A null return means the policy has no fetch set to emit for this datum.
            FetchSetInfo setInfo = _policy.nextFetchSet(scoredDatum);
            if (setInfo != null) {
                // The third argument is the current result of safeHasNext(), i.e. whether
                // more values remain after this one.
                FetchSetDatum result = makeFetchSetDatum(setInfo, newKey, safeHasNext());
                // A clone of the tuple is emitted rather than the datum's own tuple.
                collector.add(BixoPlatform.clone(result.getTuple(), process));
            }
        }
       
        // See if we have another partially built datum to add.
        FetchSetInfo setInfo = _policy.endFetchSet();
        if (setInfo != null) {
            // All values have been consumed here, so the third argument is false.
            FetchSetDatum result = makeFetchSetDatum(setInfo, newKey, false);
            collector.add(BixoPlatform.clone(result.getTuple(), process));
        }
    }
View Full Code Here

Examples of cascading.tuple.TupleEntryCollector

        // Fragment: writes 100 generated tuples straight into a DirectoryTap that uses a
        // SolrScheme, without running a flow.
        final Fields testFields = new Fields("id", "name", "price", "inStock");
        String out = getTestDir() + "testIndexSink/out";

        DirectoryTap solrSink = new DirectoryTap(new SolrScheme(testFields, SOLR_CORE_DIR), out, SinkMode.REPLACE);
       
        TupleEntryCollector writer = solrSink.openForWrite(new LocalFlowProcess());

        // One tuple per index: values are given in the order id, name, price, inStock.
        for (int i = 0; i < 100; i++) {
            writer.add(new Tuple(i, "product #" + i, i * 1.0f, true));
        }

        writer.close();
    }
View Full Code Here

Examples of cascading.tuple.TupleEntryCollector

        // Fragment: writes two hand-built tuples to the source tap, then starts setting up
        // the pipe and the Solr sink that will consume them.
        final String out = getTestDir() + "testSimpleIndexing/out";

        byte[] imageData = new byte[] {0, 1, 2, 3, 5};
       
        Tap source = makeSourceTap(testFields, in);
        TupleEntryCollector write = source.openForWrite(makeFlowProcess());
        // First tuple: the fourth value is a nested Tuple and the last value is a raw byte[].
        Tuple t = new Tuple();
        t.add(1);
        t.add("TurboWriter 2.3");
        t.add(395.50f);
        t.add(new Tuple("wordprocessor", "Japanese"));
        t.add(true);
        t.add(imageData);
        write.add(t);
       
        // Second tuple: the fourth value is a plain String and the last value is a BytesWritable.
        t = new Tuple();
        t.add(2);
        t.add("Shasta 1.0");
        t.add(95.00f);
        t.add("Chinese");
        t.add(false);
       
        // The capacity is set 10 bytes larger than the data.
        // NOTE(review): presumably this checks that only the valid bytes are used downstream - confirm.
        BytesWritable bw = new BytesWritable(imageData);
        bw.setCapacity(imageData.length + 10);
        t.add(bw);
        write.add(t);
        write.close();

        // Now read from the results, and write to a Solr index.
        Pipe writePipe = new Pipe("tuples to Solr");

        Tap solrSink = makeSolrSink(testFields, out);
View Full Code Here

Examples of cascading.tuple.TupleEntryCollector

  /**
   * Writes two three-field tuples through a collector obtained from the test helper, then
   * begins building a flow that reads them back from an {@code Hfs} sequence file.
   * Only the start of the method is visible in this snippet.
   *
   * @throws Exception if the visible setup (or the part not shown) fails
   */
  public void testSimpleFlow() throws Exception {
    // This tests reading from a file on HFS and writing the output tuples to HBase.
    // It makes sure that the tuples that result are serialized and deserialized properly.
    Fields inputFields = new Fields("num", "lower", "upper");
    TupleEntryCollector input = mHelper.makeCollectorForWrite("input", inputFields);

    // Set up the input.
    Tuple[] expected = new Tuple[] {
      new Tuple("1", "a", "b"),
      new Tuple("2", "test", "other"),
    };

    for (Tuple t : expected) {
      input.add(t);
    }
    // Close the collector before the source tap below is created over the same "input" name.
    input.close();

    // Create flow to read from local file and insert into HBase.
    Tap source = new Hfs(new SequenceFile(inputFields), mHelper.manageTemporaryPath("input"));

    Pipe pipe = new Pipe("values");
View Full Code Here

Examples of cascading.tuple.TupleEntryCollector

    // Fragment: refuses to reuse an existing input file, then returns a collector that
    // writes a sequence file of the given fields at inputPath.
    // NOTE(review): the existence check uses java.io.File while the tap is an Hfs tap;
    // confirm both refer to the same filesystem in the test setup.
    File inputFile = new File(inputPath);
    if (inputFile.exists()) {
      throw new CascadingException("Input file " + inputPath + " already exists.");
    }
    Tap inputTap = new Hfs(new SequenceFile(fields), inputPath, SinkMode.REPLACE);
    // The caller is handed the open collector; nothing here closes it.
    TupleEntryCollector collector = inputTap.openForWrite(getJobConf());
    return collector;
  }
View Full Code Here

Examples of cascading.tuple.TupleEntryCollector

  }

  public void operate(FlowProcess flow_process, BufferCall call) {
    ISeq resultSeq = RT.seq(invokeFunction(IteratorSeq
        .create(new TupleSeqConverter(call.getArgumentsIterator()))));
    TupleEntryCollector collector = call.getOutputCollector();
    while (resultSeq != null) {
      Object obj = resultSeq.first();
      collector.add(Util.coerceToTuple(obj));
      resultSeq = resultSeq.next();
    }
  }
View Full Code Here

Examples of cascading.tuple.TupleEntryCollector

  }

  public void complete(FlowProcess flow_process, AggregatorCall ag_call) {
    Collection coll = (Collection) invokeFunction(ag_call.getContext());

    TupleEntryCollector collector = ag_call.getOutputCollector();

    if (coll != null) {
      for (Object o : coll) {
        collector.add(Util.coerceToTuple(o));
      }
    }
  }
View Full Code Here

Examples of cascading.tuple.TupleEntryCollector

  }

  public void operate(FlowProcess fp, FunctionCall call) {
    ISeq fnArgs = Util.coerceFromTuple(call.getArguments().getTuple());
    ISeq resultSeq = RT.seq(applyFunction(fnArgs));
    TupleEntryCollector collector = call.getOutputCollector();
    while (resultSeq != null) {
      Object obj = resultSeq.first();
      collector.add(Util.coerceToTuple(obj));
      resultSeq = resultSeq.next();
    }
  }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.