Package org.kitesdk.morphline.api

Source Code of org.kitesdk.morphline.api.CSVTokenizerTest

/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kitesdk.morphline.api;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.junit.Assert;
import org.junit.Test;
import org.kitesdk.morphline.shaded.com.googlecode.jcsv.fastreader.CSVTokenizer;
import org.kitesdk.morphline.shaded.com.googlecode.jcsv.fastreader.QuotedCSVTokenizer;
import org.kitesdk.morphline.shaded.com.googlecode.jcsv.fastreader.SimpleCSVTokenizer;

import com.google.common.collect.Lists;

public class CSVTokenizerTest extends Assert {
 
  @Test
  public void testBasic() throws Exception {
    split(Arrays.asList("hello", "world"), "hello,world", ',', false, true);
    split(Arrays.asList(" hello", " world "), " hello| world ", '|', false, true);
    split(Arrays.asList("", "hello", "world", ""), "|hello|world|", '|', false, true);
    split(Arrays.asList(""), "", '|', false, true);
    split(Arrays.asList("", "", "x"), "||x", '|', false, true);
    split(Lists.newArrayList(), "", '|', false, false);
    split(Arrays.asList("x"), "x", '|', false, false);
    split(Arrays.asList(null,"x"), "|x", '|', false, false);
   
    Record record = new Record();
    CSVTokenizer tokenizer = new SimpleCSVTokenizer(',', true, true, new ArrayList<String>());
    tokenizer.tokenizeLine(" x ", null, record);
    assertEquals(Arrays.asList("x"), record.get("column0"));
  }
 
  private void split(List expected, String line, char separator, boolean isQuoted, boolean addEmptyStrings) throws IOException {
    Record record = new Record();
    CSVTokenizer tokenizer;
    if (isQuoted) {
      tokenizer = new QuotedCSVTokenizer(separator, false, addEmptyStrings, new ArrayList<String>(), 1000, false, '"');
      tokenizer.tokenizeLine(line, new BufferedReader(new StringReader("")), record);     
    } else {
      tokenizer = new SimpleCSVTokenizer(separator, false, addEmptyStrings, new ArrayList<String>());
      tokenizer.tokenizeLine(line, null, record);
    }
    for (int i = 0; i < expected.size(); i++) {
      assertEquals(expected.get(i), record.getFirstValue("column" + i));
    }
    assertTrue(record.getFields().asMap().size() <= expected.size());
  }
 
  @Test
  public void testQuoted() throws Exception {
    char quote = '"';
    String doubleQuote = String.valueOf(quote) + String.valueOf(quote);
    split(
        Arrays.asList("5", "orange", "This is a\nmulti, line text", "no\""),
        "5,orange,\"This is a\nmulti, line text\",no\"\"", ',', true, true);
    split(Arrays.asList("x", ""), "x|", '|', true, true);
    split(Arrays.asList("x", ""), "x|" + doubleQuote, '|', true, true);
    split(Lists.newArrayList(), doubleQuote, '|', true, true);
    split(Lists.newArrayList(), "", '|', true, true);
    split(Lists.newArrayList(), "", '|', true, false);
    split(Arrays.asList("x"), "x", '|', true, false);
    split(Arrays.asList(null,"x"), "|x", '|', true, false);
    split(Arrays.asList(null,"x"), doubleQuote + "|" + quote + "x" + quote, '|', true, false);
  }

  @Test(expected=IllegalStateException.class)
  public void testIllegalQuotedState() throws Exception {
    split(Lists.newArrayList(), "foo,maybe\"", ',', true, true);
  }

  @Test(expected=IllegalArgumentException.class)
  public void testThrowExceptionIfRecordTooLong() throws Exception {
    boolean ignoreTooLongRecords = false;
    int maxCharactersPerRecord = 10;
    CSVTokenizer tokenizer = new QuotedCSVTokenizer(
        ',', false, false, new ArrayList<String>(), maxCharactersPerRecord, ignoreTooLongRecords, '"');
    tokenizer.tokenizeLine(
        "\"",
        new BufferedReader(new StringReader("line tooooooooo long\"")),
        new Record());
  }

  @Test
  public void testIgnoreRecordTooLong() throws Exception {
    boolean ignoreTooLongRecords = true;
    int maxCharactersPerRecord = 10;
    CSVTokenizer tokenizer = new QuotedCSVTokenizer(
        ',', false, false, new ArrayList<String>(), maxCharactersPerRecord, ignoreTooLongRecords, '"');
    assertFalse(tokenizer.tokenizeLine(
        "\"",
        new BufferedReader(new StringReader("line tooooooooo long\"")),
        new Record()));
  }

}
TOP

Related Classes of org.kitesdk.morphline.api.CSVTokenizerTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.