Package ivory.core.tokenize

Source Code of ivory.core.tokenize.DocumentProcessingUtils

/*
* Ivory: A Hadoop toolkit for web-scale information retrieval
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package ivory.core.tokenize;

import ivory.core.data.dictionary.DefaultCachedFrequencySortedDictionary;
import ivory.core.data.document.TermDocVector;

import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;

import org.apache.log4j.Logger;

import com.google.common.collect.Maps;

import edu.umd.cloud9.collection.Indexable;
import edu.umd.cloud9.util.array.ArrayListOfInts;

/**
* @author Tamer Elsayed
* @author Jimmy Lin
*/
public class DocumentProcessingUtils {
  private static final Logger LOG = Logger.getLogger(DocumentProcessingUtils.class);

  public static short TF_CUT = Short.MAX_VALUE;

  public static SortedMap<Integer, int[]> integerizeTermDocVector(TermDocVector doc,
      DefaultCachedFrequencySortedDictionary termIDMap) {
    SortedMap<Integer, int[]> positions = Maps.newTreeMap();

    TermDocVector.Reader reader = null;
    try {
      reader = doc.getReader();
    } catch (IOException e1) {
      throw new RuntimeException("Error getting TermDocVectorReader: " + e1.getMessage());
    }

    while (reader.hasMoreTerms()) {
      int termid = termIDMap.getId(reader.nextTerm());
      if (termid <= 0) {
        continue;
      }

      positions.put(termid, reader.getPositions());
    }

    return positions;
  }

  public static Map<String, ArrayListOfInts> parseDocument(Indexable doc, Tokenizer tokenizer) {
    Map<String, ArrayListOfInts> positions = Maps.newHashMap();

    String text = doc.getContent();
    String[] terms = tokenizer.processContent(text);

    // The tokenizer may return terms with zero length (empty terms), and the tf may exceed the
    // capacity of a short (in which case we need to handle separately).

    for (int i = 0; i < terms.length; i++) {
      String term = terms[i];

      // Guard against bad tokenization
      if (term.length() == 0 || term.length() >= Byte.MAX_VALUE) {
        continue;
      }

      // Remember, token position is numbered started from one...
      if (positions.containsKey(term)) {
        positions.get(term).add(i + 1);
      } else {
        ArrayListOfInts l = new ArrayListOfInts();
        l.add(i + 1);
        positions.put(term, l);
      }
    }

    int doclength = 0;
    Iterator<Map.Entry<String, ArrayListOfInts>> it = positions.entrySet().iterator();
    Map.Entry<String, ArrayListOfInts> e;
    ArrayListOfInts positionsList;
    while (it.hasNext()) {
      e = it.next();
      positionsList = e.getValue();

      // We're storing tfs as shorts, so check for overflow...
      if (positionsList.size() >= TF_CUT) {
        // There are a few ways to handle this... If we're getting such a high tf, then it most
        // likely means that this is a junk doc.
        LOG.warn("Error: tf of " + e.getValue()
            + " will overflow max short value. docno=" + doc.getDocid() + ", term="
            + e.getKey());
        it.remove();
      } else {
        positionsList.trimToSize();
        doclength += positionsList.size();
      }
    }

    if ( positions.size() == 0 ) {
      return positions;
    }

    positions.put("", new ArrayListOfInts(new int[] { doclength }));
    return positions;
  }
}
TOP

Related Classes of ivory.core.tokenize.DocumentProcessingUtils

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.