Package org.exoplatform.test

Source Code of org.exoplatform.test.HtmlCharsetDetector

/**
* Copyright (C) 2009 eXo Platform SAS.
*
* This is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this software; if not, write to the Free
* Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA, or see the FSF site: http://www.fsf.org.
*/

/*
* DO NOT EDIT THIS DOCUMENT MANUALLY !!!
* THIS FILE IS AUTOMATICALLY GENERATED BY THE TOOLS UNDER
*    AutoDetect/tools/
*/
package org.exoplatform.test;

import org.exoplatform.component.test.AbstractGateInTest;
import org.exoplatform.services.chars.chardet.Detector;
import org.exoplatform.services.chars.chardet.ICharsetDetectionObserver;
import org.exoplatform.services.chars.chardet.PSMDetector;

import java.io.BufferedInputStream;
import java.net.URL;

/**
* Created by The eXo Platform SARL
* Author : Lai Van Khoi
*          laivankhoi46pm1@yahoo.com
* Nov 27, 2006 
*/
public class HtmlCharsetDetector extends AbstractGateInTest
{

   public static boolean found = false;

   public void testClass() throws Exception
   {
      String[] argv = new String[]{"http://www.laodong.com.vn/Home/khoilv/2006/9/4343.laodong", "6"};

      //if(argv.length!=1 && argv.length!=2){
      if (argv.length != 2)
      {
         System.out.println("Usage: HtmlCharacterDetector <url> [<languageHint>]");

         System.out.println("");
         System.out.println("Where <url> is http://...");
         System.out.println("For optional <languageHint>. Use the following...");
         System.out.println("    1 => Japanese");
         System.out.println("    2 => Chinese");
         System.out.println("    3 => Simplified Chinese");
         System.out.println("    4 => Traditional Chinese");
         System.out.println("    5 => Korean");
         System.out.println("    6 => Don't know (default)");

         return;
      }

      //Initialize the nsDetector();
      int lang = (argv.length == 2) ? Integer.parseInt(argv[1]) : PSMDetector.ALL;
      Detector det = new Detector(lang);

      //Set an observer...
      //The Notify() will be called when a matching charset is found.

      det.init(new ICharsetDetectionObserver()
      {
         public void notify(String charset)
         {
            HtmlCharsetDetector.found = true;
            System.out.println("CHARSET = " + charset);
         }
      });

      URL url = new URL(argv[0]);
      BufferedInputStream imp = new BufferedInputStream(url.openStream());

      byte[] buf = new byte[1024];
      int len;
      boolean done = false;
      boolean isAscii = true;

      while ((len = imp.read(buf, 0, buf.length)) != -1)
      {
         //Check if the stream is only ascii.
         if (isAscii)
            isAscii = det.isAscii(buf, len);

         //DoIt if non-ascii and not done yet.
         if (!isAscii && !done)
            done = det.doIt(buf, len, false);
      }
      det.dataEnd();

      if (isAscii)
      {
         System.out.println("CHARSET = ARSII");
         found = true;
      }

      if (!found)
      {
         String prob[] = det.getProbableCharsets();
         for (int i = 0; i < prob.length; i++)
         {
            System.out.println("Probable Charset = " + prob[i]);
         }
      }
   }
}
TOP

Related Classes of org.exoplatform.test.HtmlCharsetDetector

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., owned by Oracle Inc. Contact coftware#gmail.com.