Class org.apache.accumulo.examples.wikisearch.ingest.ArticleExtractor

Examples of org.apache.accumulo.examples.wikisearch.ingest.ArticleExtractor.Article

The snippets below come from the wikisearch ingest mappers: the first forwards parsed Article objects to a language-keyed partitioning stage, and the second turns an Article into Accumulo Mutations for the document, partition index, global index, reverse index, and metadata tables.


  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    // setup() is truncated in the original listing; only the extractor
    // initialization is shown.
    extractor = new ArticleExtractor();
  }
 
  @Override
  protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    Article article = extractor.extract(new InputStreamReader(new ByteArrayInputStream(value.getBytes()), UTF8));
    if (article != null) {
      int groupId = WikipediaMapper.getPartitionId(article, numGroups);
      if(groupId != myGroup)
        return;
      context.write(new Text(language), article);
    }
  }
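
For reference, the same extract-and-check pattern can be exercised outside of MapReduce. A minimal standalone sketch, assuming only what the snippets show: extract(Reader) returns an Article or null, and Article exposes getId(), getTimestamp(), and getFieldValues(); the pageXml placeholder stands in for one <page> element of a Wikipedia XML dump.

  import java.io.ByteArrayInputStream;
  import java.io.InputStreamReader;
  import java.nio.charset.StandardCharsets;
  import java.util.Map;

  import org.apache.accumulo.examples.wikisearch.ingest.ArticleExtractor;
  import org.apache.accumulo.examples.wikisearch.ingest.ArticleExtractor.Article;

  public class ArticleExtractorDemo {
    public static void main(String[] args) throws Exception {
      // Placeholder input; a real run would pass a complete <page> element
      // from a Wikipedia XML dump.
      byte[] pageXml = "<page>...</page>".getBytes(StandardCharsets.UTF_8);

      ArticleExtractor extractor = new ArticleExtractor();
      Article article = extractor.extract(
          new InputStreamReader(new ByteArrayInputStream(pageXml), StandardCharsets.UTF_8));

      // extract() returns null for pages that are not valid articles,
      // mirroring the null check in the mappers above.
      if (article == null) {
        System.err.println("invalid article");
        return;
      }
      System.out.println("id=" + article.getId() + " timestamp=" + article.getTimestamp());
      for (Map.Entry<String,Object> e : article.getFieldValues().entrySet())
        System.out.println("  " + e.getKey() + " = " + e.getValue());
    }
  }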


 
  // Tracks which metadata rows have already been written so that each
  // field/column-family/language combination is sent to the metadata table
  // at most once per mapper instance.
  static HashSet<String> metadataSent = new HashSet<String>();

  @Override
  protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    Article article = extractor.extract(new InputStreamReader(new ByteArrayInputStream(value.getBytes()), UTF8));
    String NULL_BYTE = "\u0000";
    String colfPrefix = language + NULL_BYTE;
    String indexPrefix = "fi" + NULL_BYTE;
    if (article != null) {
      int groupId = WikipediaMapper.getPartitionId(article, numGroups);
      if(groupId != myGroup)
        return;
      Text partitionId = new Text(Integer.toString(WikipediaMapper.getPartitionId(article, numPartitions)));
     
      // Create the mutations for the document.
      // Row is partition id, colf is language\0articleid, colq is fieldName\0fieldValue
      Mutation m = new Mutation(partitionId);
      for (Entry<String,Object> entry : article.getFieldValues().entrySet()) {
        m.put(colfPrefix + article.getId(), entry.getKey() + NULL_BYTE + entry.getValue().toString(), cv, article.getTimestamp(), NULL_VALUE);
        // Create mutations for the metadata table.
        String metadataKey = entry.getKey() + METADATA_EVENT_COLUMN_FAMILY + language;
        if (!metadataSent.contains(metadataKey)) {
          Mutation mm = new Mutation(entry.getKey());
          mm.put(METADATA_EVENT_COLUMN_FAMILY, language, cv, article.getTimestamp(), NULL_VALUE);
          context.write(metadataTableName, mm);
          metadataSent.add(metadataKey);
        }
      }
     
      // Tokenize the content
      Set<String> tokens = getTokens(article);
     
      // We are going to put the fields to be indexed into a multimap. This allows us to iterate
      // over the entire set once.
      Multimap<String,String> indexFields = HashMultimap.create();
      // Add the normalized field values
      LcNoDiacriticsNormalizer normalizer = new LcNoDiacriticsNormalizer();
      for (Entry<String,String> index : article.getNormalizedFieldValues().entrySet())
        indexFields.put(index.getKey(), index.getValue());
      // Add the tokens
      for (String token : tokens)
        indexFields.put(TOKENS_FIELD_NAME, normalizer.normalizeFieldValue("", token));
     
      for (Entry<String,String> index : indexFields.entries()) {
        // Create mutations for the in partition index
        // Row is partition id, colf is 'fi'\0fieldName, colq is fieldValue\0language\0article id
        m.put(indexPrefix + index.getKey(), index.getValue() + NULL_BYTE + colfPrefix + article.getId(), cv, article.getTimestamp(), NULL_VALUE);
       
        // Create mutations for the global index
        // Create a UID object for the Value
        Builder uidBuilder = Uid.List.newBuilder();
        uidBuilder.setIGNORE(false);
        uidBuilder.setCOUNT(1);
        uidBuilder.addUID(Integer.toString(article.getId()));
        Uid.List uidList = uidBuilder.build();
        Value val = new Value(uidList.toByteArray());
       
        // Create mutations for the global index
        // Row is field value, colf is field name, colq is partitionid\0language, value is Uid.List object
        Mutation gm = new Mutation(index.getValue());
        gm.put(index.getKey(), partitionId + NULL_BYTE + language, cv, article.getTimestamp(), val);
        context.write(indexTableName, gm);
       
        // Create mutations for the global reverse index
        Mutation grm = new Mutation(StringUtils.reverse(index.getValue()));
        grm.put(index.getKey(), partitionId + NULL_BYTE + language, cv, article.getTimestamp(), val);
        context.write(reverseIndexTableName, grm);
       
        // Create mutations for the metadata table.
        String metadataKey = index.getKey() + METADATA_INDEX_COLUMN_FAMILY + language;
        if (!metadataSent.contains(metadataKey)) {
          Mutation mm = new Mutation(index.getKey());
          mm.put(METADATA_INDEX_COLUMN_FAMILY, language + NULL_BYTE + LcNoDiacriticsNormalizer.class.getName(), cv, article.getTimestamp(), NULL_VALUE);
          context.write(metadataTableName, mm);
          metadataSent.add(metadataKey);
        }
      }
      // Add the entire text to the document section of the table.
      // Row is the partition, colf is 'd', colq is language\0articleid, value is the Base64-encoded document text
      m.put(DOCUMENT_COLUMN_FAMILY, colfPrefix + article.getId(), cv, article.getTimestamp(), new Value(Base64.encodeBase64(article.getText().getBytes())));
      context.write(tablename, m);
     
    } else {
      context.getCounter("wikipedia", "invalid articles").increment(1);
    }
  }
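
Both the group check and the partition row derive from WikipediaMapper.getPartitionId. In the wikisearch example this amounts to a modulo over the article id; a minimal sketch of that helper, with the signature inferred from the calls above:

  // Hedged sketch of WikipediaMapper.getPartitionId as used above: the
  // article id modulo the number of partitions (or groups) selects a bucket,
  // so the same article always lands in the same partition.
  public static int getPartitionId(Article article, int numPartitions) {
    return article.getId() % numPartitions;
  }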

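On the read side, the Value written to the global and reverse indexes can be turned back into a Uid.List with the standard protobuf parser. A minimal sketch, assuming the Uid class is the generated protobuf code from the example's protobuf package:

  import org.apache.accumulo.core.data.Value;
  import org.apache.accumulo.examples.wikisearch.protobuf.Uid;

  // Decode a global-index Value back into the Uid.List that the mapper
  // serialized with uidList.toByteArray().
  public static Uid.List decodeUids(Value v)
      throws com.google.protobuf.InvalidProtocolBufferException {
    return Uid.List.parseFrom(v.get());
  }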

