Package org.apache.mahout.common.nlp

Examples of org.apache.mahout.common.nlp.NGrams


  @Override
  public void map(Text key, Text value,
                  OutputCollector<StringTuple,DoubleWritable> output, Reporter reporter) throws IOException {
    String label = key.toString();
   
    List<String> ngrams = new NGrams(value.toString(), gramSize).generateNGramsWithoutLabel();
   
    try {
      ClassifierResult result = classifier.classifyDocument(ngrams.toArray(new String[ngrams.size()]),
        defaultCategory);
     
View Full Code Here


       
        long lineNum = 0;
        for (String line : new FileLineIterable(new File(file.getPath()), Charset.forName(params
            .get("encoding")), false)) {
         
          Map<String,List<String>> document = new NGrams(line, Integer.parseInt(params.get("gramSize")))
              .generateNGrams();
          for (Map.Entry<String,List<String>> stringListEntry : document.entrySet()) {
            String correctLabel = stringListEntry.getKey();
            List<String> strings = stringListEntry.getValue();
            TimingStatistics.Call call = operationStats.newCall();
View Full Code Here

    ClassifierContext classifier = new ClassifierContext(algorithm, datastore);
    classifier.initialize();
    ResultAnalyzer resultAnalyzer = new ResultAnalyzer(classifier.getLabels(), params.get("defaultCat"));
   
    for (String[] entry : ClassifierData.DATA) {
      List<String> document = new NGrams(entry[1], Integer.parseInt(params.get("gramSize")))
          .generateNGramsWithoutLabel();
      assertEquals(3, classifier.classifyDocument(document.toArray(new String[document.size()]),
        params.get("defaultCat"), 100).length);
      ClassifierResult result = classifier.classifyDocument(document.toArray(new String[document.size()]), params
          .get("defaultCat"));
View Full Code Here

    Datastore datastore = new InMemoryBayesDatastore(params);
    ClassifierContext classifier = new ClassifierContext(algorithm, datastore);
    classifier.initialize();
    ResultAnalyzer resultAnalyzer = new ResultAnalyzer(classifier.getLabels(), params.get("defaultCat"));
    for (String[] entry : ClassifierData.DATA) {
      List<String> document = new NGrams(entry[1], Integer.parseInt(params.get("gramSize")))
          .generateNGramsWithoutLabel();
      assertEquals(3, classifier.classifyDocument(document.toArray(new String[document.size()]),
        params.get("defaultCat"), 100).length);
      ClassifierResult result = classifier.classifyDocument(document.toArray(new String[document.size()]), params
          .get("defaultCat"));
View Full Code Here

    StringBuilder line = new StringBuilder();
    for (String token : document) {
      line.append(token).append(' ');
    }
   
    List<String> doc = new NGrams(line.toString(), gramSize).generateNGramsWithoutLabel();
   
    log.info("Done converting");
    log.info("Classifying document: {}", docPath);
    ClassifierResult category = classifier.classifyDocument(doc.toArray(new String[doc.size()]), defaultCat);
    log.info("Category for {} is {}", docPath, category);
View Full Code Here

    //String line = value.toString();
    String label = key.toString();

    Map<String, int[]> wordList = new HashMap<String, int[]>(1000);

    List<String> ngrams  = new NGrams(value.toString(), gramSize).generateNGramsWithoutLabel();

    for (String ngram : ngrams) {
      int[] count = wordList.get(ngram);
      if (count == null) {
        count = new int[1];
View Full Code Here


    //StringBuilder builder = new StringBuilder(label);
    //builder.ensureCapacity(32);// make sure we have a reasonably size buffer to
                               // begin with
    List<String> ngrams  = new NGrams(value.toString(), gramSize).generateNGramsWithoutLabel();
   
    try {
      ClassifierResult result = classifier.classifyDocument( ngrams
          .toArray(new String[ngrams.size()]), defaultCategory);
    
View Full Code Here

        long lineNum = 0;
        for (String line : new FileLineIterable(new File(file.getPath()),
            Charset.forName(params.get("encoding")), false)) {

          Map<String, List<String>> document = new NGrams(line, Integer
              .parseInt(params.get("gramSize"))).generateNGrams();
          for (Map.Entry<String, List<String>> stringListEntry : document
              .entrySet()) {
            List<String> strings = stringListEntry.getValue();
            TimingStatistics.Call call = operationStats.newCall();
View Full Code Here

    StringBuilder line = new StringBuilder();
    for (String token : document) {
      line.append(token).append(' ');
    }

    List<String> doc = new NGrams(line.toString(), gramSize)
        .generateNGramsWithoutLabel();

    log.info("Done converting");
    log.info("Classifying document: {}", docPath);
    ClassifierResult category = classifier.classifyDocument(doc
View Full Code Here

    StringBuilder line = new StringBuilder();
    for (String token : document) {
      line.append(token).append(' ');
    }
   
    List<String> doc = new NGrams(line.toString(), gramSize).generateNGramsWithoutLabel();
   
    log.info("Done converting");
    log.info("Classifying document: {}", docPath);
    ClassifierResult category = classifier.classifyDocument(doc.toArray(new String[doc.size()]), defaultCat);
    log.info("Category for {} is {}", docPath, category);
View Full Code Here

TOP

Related Classes of org.apache.mahout.common.nlp.NGrams

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., and is owned by Oracle Inc. Contact coftware#gmail.com.