Package com.github.pmerienne.trident.ml.preprocessing

Examples of com.github.pmerienne.trident.ml.preprocessing.TwitterTokenizer


  @Test
  public void testRemoveUsername() {
    // Given
    String tweet = "@PrincessSuperC Hey Cici";
    TwitterTokenizer tokenizer = new TwitterTokenizer();

    // When
    List<String> actualTokens = tokenizer.tokenize(tweet);

    // Then
    List<String> expectedTokens = Arrays.asList("hei", "cici");
    assertEquals(expectedTokens, actualTokens);
  }
View Full Code Here


  @Test
  public void testRemoveHashTagIndicator() {
    // Given
    String tweet = "arg #word!";
    TwitterTokenizer tokenizer = new TwitterTokenizer();

    // When
    List<String> actualTokens = tokenizer.tokenize(tweet);

    // Then
    List<String> expectedTokens = Arrays.asList("arg", "word");
    assertEquals(expectedTokens, actualTokens);
  }
View Full Code Here

  @Test
  public void testRemoveCharacterRepetitions() {
    // Given
    String tweet = "so huuunggryy!";
    TwitterTokenizer tokenizer = new TwitterTokenizer();

    // When
    List<String> actualTokens = tokenizer.tokenize(tweet);

    // Then
    List<String> expectedTokens = Arrays.asList("so", "hungri");
    assertEquals(expectedTokens, actualTokens);
  }
View Full Code Here

  @Test
  public void testRemoveWordsStartingNumber() {
    // Given
    String tweet = "it's 15PM we're in the 20th century";
    TwitterTokenizer tokenizer = new TwitterTokenizer();

    // When
    List<String> actualTokens = tokenizer.tokenize(tweet);

    // Then
    List<String> expectedTokens = Arrays.asList("we'r", "centuri");
    assertEquals(expectedTokens, actualTokens);
  }
View Full Code Here

  @Test
  public void testRemoveHTMLMarkum() {
    // Given
    String tweet = "fb &gt; tw";
    TwitterTokenizer tokenizer = new TwitterTokenizer();

    // When
    List<String> actualTokens = tokenizer.tokenize(tweet);

    // Then
    List<String> expectedTokens = Arrays.asList("fb", "tw");
    assertEquals(expectedTokens, actualTokens);
  }
View Full Code Here

  @Test
  public void testWithNGram() {
    // Given
    String tweet = "it's not bad movie";
    TwitterTokenizer tokenizer = new TwitterTokenizer(2, 2);

    // When
    List<String> actualTokens = tokenizer.tokenize(tweet);

    // Then
    List<String> expectedTokens = Arrays.asList("_ bad", "bad", "bad movi", "movi");
    assertEquals(expectedTokens, actualTokens);
  }
View Full Code Here

    }
  }

  protected static void loadTwitterData() throws IOException {
    TWITTER_SAMPLES = new ArrayList<TextInstance<Boolean>>();
    TwitterTokenizer tokenizer = new TwitterTokenizer(2, 2);

    FileInputStream is = new FileInputStream(TWITTER_FILE);
    BufferedReader br = new BufferedReader(new InputStreamReader(is));
    try {
      String line;
      while ((line = br.readLine()) != null) {
        try {
          String[] values = line.split(",");

          Boolean label = !values[0].equals("0");
          String text = line.substring(line.indexOf(",") + 1);

          TWITTER_SAMPLES.add(new TextInstance<Boolean>(label, tokenizer.tokenize(text)));
        } catch (Exception ex) {
          System.err.println("Skipped twitter sample because it can't be parsed : " + line);
        }
      }
View Full Code Here

TOP

Related Classes of com.github.pmerienne.trident.ml.preprocessing.TwitterTokenizer

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.