Package org.apache.lucene.analysis.hunspell

Source Code of org.apache.lucene.analysis.hunspell.TestAllDictionaries

package org.apache.lucene.analysis.hunspell;

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import java.io.File;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.RamUsageEstimator;
import org.junit.Ignore;

/**
* Can be retrieved via:
* wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/
* Note some of the files differ only in case. This may be a problem on your operating system!
*/
@Ignore("enable manually")
public class TestAllDictionaries extends LuceneTestCase {
 
  // set this to the location of where you downloaded all the files
  static final File DICTIONARY_HOME =
      new File("/data/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries");
 
  final String tests[] = {
    /* zip file */               /* dictionary */       /* affix */
    "af_ZA.zip",                 "af_ZA.dic",           "af_ZA.aff",
    "ak_GH.zip",                 "ak_GH.dic",           "ak_GH.aff",
    "bg_BG.zip",                 "bg_BG.dic",           "bg_BG.aff",
    "ca_ANY.zip",                "catalan.dic",         "catalan.aff",
    "ca_ES.zip",                 "ca_ES.dic",           "ca_ES.aff",
// BUG: broken flag "cop_EG.zip",                "cop_EG.dic",          "cop_EG.aff",
    "cs_CZ.zip",                 "cs_CZ.dic",           "cs_CZ.aff",
    "cy_GB.zip",                 "cy_GB.dic",           "cy_GB.aff",
    "da_DK.zip",                 "da_DK.dic",           "da_DK.aff",
    "de_AT.zip",                 "de_AT.dic",           "de_AT.aff",
    "de_CH.zip",                 "de_CH.dic",           "de_CH.aff",
    "de_DE.zip",                 "de_DE.dic",           "de_DE.aff",
    "de_DE_comb.zip",            "de_DE_comb.dic",      "de_DE_comb.aff",
    "de_DE_frami.zip",           "de_DE_frami.dic",     "de_DE_frami.aff",
    "de_DE_neu.zip",             "de_DE_neu.dic",       "de_DE_neu.aff",
    "el_GR.zip",                 "el_GR.dic",           "el_GR.aff",
    "en_AU.zip",                 "en_AU.dic",           "en_AU.aff",
    "en_CA.zip",                 "en_CA.dic",           "en_CA.aff",
    "en_GB-oed.zip",             "en_GB-oed.dic",       "en_GB-oed.aff",
    "en_GB.zip",                 "en_GB.dic",           "en_GB.aff",
    "en_NZ.zip",                 "en_NZ.dic",           "en_NZ.aff",
    "eo.zip",                    "eo_l3.dic",           "eo_l3.aff",
    "eo_EO.zip",                 "eo_EO.dic",           "eo_EO.aff",
    "es_AR.zip",                 "es_AR.dic",           "es_AR.aff",
    "es_BO.zip",                 "es_BO.dic",           "es_BO.aff",
    "es_CL.zip",                 "es_CL.dic",           "es_CL.aff",
    "es_CO.zip",                 "es_CO.dic",           "es_CO.aff",
    "es_CR.zip",                 "es_CR.dic",           "es_CR.aff",
    "es_CU.zip",                 "es_CU.dic",           "es_CU.aff",
    "es_DO.zip",                 "es_DO.dic",           "es_DO.aff",
    "es_EC.zip",                 "es_EC.dic",           "es_EC.aff",
    "es_ES.zip",                 "es_ES.dic",           "es_ES.aff",
    "es_GT.zip",                 "es_GT.dic",           "es_GT.aff",
    "es_HN.zip",                 "es_HN.dic",           "es_HN.aff",
    "es_MX.zip",                 "es_MX.dic",           "es_MX.aff",
    "es_NEW.zip",                "es_NEW.dic",          "es_NEW.aff",
    "es_NI.zip",                 "es_NI.dic",           "es_NI.aff",
    "es_PA.zip",                 "es_PA.dic",           "es_PA.aff",
    "es_PE.zip",                 "es_PE.dic",           "es_PE.aff",
    "es_PR.zip",                 "es_PR.dic",           "es_PR.aff",
    "es_PY.zip",                 "es_PY.dic",           "es_PY.aff",
    "es_SV.zip",                 "es_SV.dic",           "es_SV.aff",
    "es_UY.zip",                 "es_UY.dic",           "es_UY.aff",
    "es_VE.zip",                 "es_VE.dic",           "es_VE.aff",
    "et_EE.zip",                 "et_EE.dic",           "et_EE.aff",
    "fo_FO.zip",                 "fo_FO.dic",           "fo_FO.aff",
    "fr_FR-1990_1-3-2.zip",      "fr_FR-1990.dic",      "fr_FR-1990.aff",
    "fr_FR-classique_1-3-2.zip", "fr_FR-classique.dic", "fr_FR-classique.aff",
    "fr_FR_1-3-2.zip",           "fr_FR.dic",           "fr_FR.aff",
    "fy_NL.zip",                 "fy_NL.dic",           "fy_NL.aff",
    "ga_IE.zip",                 "ga_IE.dic",           "ga_IE.aff",
    "gd_GB.zip",                 "gd_GB.dic",           "gd_GB.aff",
    "gl_ES.zip",                 "gl_ES.dic",           "gl_ES.aff",
    "gsc_FR.zip",                "gsc_FR.dic",          "gsc_FR.aff",
    "gu_IN.zip",                 "gu_IN.dic",           "gu_IN.aff",
    "he_IL.zip",                 "he_IL.dic",           "he_IL.aff",
    "hi_IN.zip",                 "hi_IN.dic",           "hi_IN.aff",
    "hil_PH.zip",                "hil_PH.dic",          "hil_PH.aff",
    "hr_HR.zip",                 "hr_HR.dic",           "hr_HR.aff",
    "hu_HU.zip",                 "hu_HU.dic",           "hu_HU.aff",
    "hu_HU_comb.zip",            "hu_HU.dic",           "hu_HU.aff",
    "ia.zip",                    "ia.dic",              "ia.aff",
    "id_ID.zip",                 "id_ID.dic",           "id_ID.aff",
    "it_IT.zip",                 "it_IT.dic",           "it_IT.aff",
    "ku_TR.zip",                 "ku_TR.dic",           "ku_TR.aff",
    "la.zip",                    "la.dic",              "la.aff",
    "lt_LT.zip",                 "lt_LT.dic",           "lt_LT.aff",
    "lv_LV.zip",                 "lv_LV.dic",           "lv_LV.aff",
    "mg_MG.zip",                 "mg_MG.dic",           "mg_MG.aff",
    "mi_NZ.zip",                 "mi_NZ.dic",           "mi_NZ.aff",
    "mk_MK.zip",                 "mk_MK.dic",           "mk_MK.aff",
    "mos_BF.zip",                "mos_BF.dic",          "mos_BF.aff",
    "mr_IN.zip",                 "mr_IN.dic",           "mr_IN.aff",
    "ms_MY.zip",                 "ms_MY.dic",           "ms_MY.aff",
    "nb_NO.zip",                 "nb_NO.dic",           "nb_NO.aff",
    "ne_NP.zip",                 "ne_NP.dic",           "ne_NP.aff",
    "nl_NL.zip",                 "nl_NL.dic",           "nl_NL.aff",
    "nl_med.zip",                "nl_med.dic",          "nl_med.aff",
    "nn_NO.zip",                 "nn_NO.dic",           "nn_NO.aff",
    "nr_ZA.zip",                 "nr_ZA.dic",           "nr_ZA.aff",
    "ns_ZA.zip",                 "ns_ZA.dic",           "ns_ZA.aff",
    "ny_MW.zip",                 "ny_MW.dic",           "ny_MW.aff",
    "oc_FR.zip",                 "oc_FR.dic",           "oc_FR.aff",
    "pl_PL.zip",                 "pl_PL.dic",           "pl_PL.aff",
    "pt_BR.zip",                 "pt_BR.dic",           "pt_BR.aff",
    "pt_PT.zip",                 "pt_PT.dic",           "pt_PT.aff",
    "ro_RO.zip",                 "ro_RO.dic",           "ro_RO.aff",
    "ru_RU.zip",                 "ru_RU.dic",           "ru_RU.aff",
    "ru_RU_ye.zip",              "ru_RU_ie.dic",        "ru_RU_ie.aff",
    "ru_RU_yo.zip",              "ru_RU_yo.dic",        "ru_RU_yo.aff",
    "rw_RW.zip",                 "rw_RW.dic",           "rw_RW.aff",
    "sk_SK.zip",                 "sk_SK.dic",           "sk_SK.aff",
    "sl_SI.zip",                 "sl_SI.dic",           "sl_SI.aff",
    "sq_AL.zip",                 "sq_AL.dic",           "sq_AL.aff",
    "ss_ZA.zip",                 "ss_ZA.dic",           "ss_ZA.aff",
    "st_ZA.zip",                 "st_ZA.dic",           "st_ZA.aff",
    "sv_SE.zip",                 "sv_SE.dic",           "sv_SE.aff",
    "sw_KE.zip",                 "sw_KE.dic",           "sw_KE.aff",
    "tet_ID.zip",                "tet_ID.dic",          "tet_ID.aff",
    "th_TH.zip",                 "th_TH.dic",           "th_TH.aff",
    "tl_PH.zip",                 "tl_PH.dic",           "tl_PH.aff",
    "tn_ZA.zip",                 "tn_ZA.dic",           "tn_ZA.aff",
    "ts_ZA.zip",                 "ts_ZA.dic",           "ts_ZA.aff",
    "uk_UA.zip",                 "uk_UA.dic",           "uk_UA.aff",
    "ve_ZA.zip",                 "ve_ZA.dic",           "ve_ZA.aff",
    "vi_VN.zip",                 "vi_VN.dic",           "vi_VN.aff",
    "xh_ZA.zip",                 "xh_ZA.dic",           "xh_ZA.aff",
    "zu_ZA.zip",                 "zu_ZA.dic",           "zu_ZA.aff",
  };
 
  public void test() throws Exception {
    for (int i = 0; i < tests.length; i += 3) {
      File f = new File(DICTIONARY_HOME, tests[i]);
      assert f.exists();
     
      try (ZipFile zip = new ZipFile(f, StandardCharsets.UTF_8)) {
        ZipEntry dicEntry = zip.getEntry(tests[i+1]);
        assert dicEntry != null;
        ZipEntry affEntry = zip.getEntry(tests[i+2]);
        assert affEntry != null;
     
        try (InputStream dictionary = zip.getInputStream(dicEntry);
             InputStream affix = zip.getInputStream(affEntry)) {
          Dictionary dic = new Dictionary(affix, dictionary);
          System.out.println(tests[i] + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
                             "words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
                             "flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
                             "strips=" + RamUsageEstimator.humanSizeOf(dic.stripData) + ", " +
                             "conditions=" + RamUsageEstimator.humanSizeOf(dic.patterns) + ", " +
                             "affixData=" + RamUsageEstimator.humanSizeOf(dic.affixData) + ", " +
                             "prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " +
                             "suffixes=" + RamUsageEstimator.humanSizeOf(dic.suffixes) + ")");
        }
      }
    }
  }
 
  public void testOneDictionary() throws Exception {
    String toTest = "hu_HU.zip";
    for (int i = 0; i < tests.length; i++) {
      if (tests[i].equals(toTest)) {
        File f = new File(DICTIONARY_HOME, tests[i]);
        assert f.exists();
       
        try (ZipFile zip = new ZipFile(f, StandardCharsets.UTF_8)) {
          ZipEntry dicEntry = zip.getEntry(tests[i+1]);
          assert dicEntry != null;
          ZipEntry affEntry = zip.getEntry(tests[i+2]);
          assert affEntry != null;
       
          try (InputStream dictionary = zip.getInputStream(dicEntry);
               InputStream affix = zip.getInputStream(affEntry)) {
              new Dictionary(affix, dictionary);
          }
        }
      }
    }   
  }
}
TOP

Related Classes of org.apache.lucene.analysis.hunspell.TestAllDictionaries

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.