Source Code of org.zanata.util.OkapiUtil

/*
 * Copyright 2010, Red Hat, Inc. and individual contributors as indicated by the
 * @author tags. See the copyright.txt file in the distribution for a full
 * listing of individual contributors.
 *
 * This is free software; you can redistribute it and/or modify it under the
 * terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2.1 of the License, or (at your option)
 * any later version.
 *
 * This software is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this software; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA, or see the FSF
 * site: http://www.fsf.org.
 */
package org.zanata.util;


import java.io.ByteArrayInputStream;
import java.io.StringReader;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import net.sf.okapi.common.LocaleId;
import net.sf.okapi.steps.tokenization.Tokenizer;
import net.sf.okapi.steps.tokenization.tokens.Tokens;


public class OkapiUtil {
    private static final Logger log = LoggerFactory.getLogger(OkapiUtil.class);


    private OkapiUtil() {
    }


    @SuppressWarnings("null")
    public static @Nonnull
    LocaleId toOkapiLocale(@Nonnull org.zanata.common.LocaleId zanataLocale) {
        return LocaleId.fromBCP47(zanataLocale.getId());
    }


    @SuppressWarnings("null")
    public static @Nonnull
    LocaleId toOkapiLocaleOrEmpty(@Nullable org.zanata.common.LocaleId locale) {
        if (locale == null) {
            // TMXWriter demands a non-null target locale, but if you write
            // your TUs with writeTUFull(), it is never actually used.
            return LocaleId.EMPTY;
        }
        return toOkapiLocale(locale);
    }


    /**
     * Count words using Okapi's WordCounter, which tries to implement the LISA
     * standard <a href=
     * "http://web.archive.org/web/20090403134742/http://www.lisa.org/Global-information-m.105.0.html"
     * >GMX-V</a>
     *
     * @param s
     * @param bcp47Locale
     * @return
     */
    public static long countWords(String s, String bcp47Locale) {
        if (s == null) {
            log.debug("null string");
            return 0;
        }
        try {
            LocaleId locale;
            try {
                locale = LocaleId.fromBCP47(bcp47Locale);
            } catch (Exception e) {
                log.error(
                        "can't understand '{}' as a BCP-47 locale; defaulting to English",
                        bcp47Locale);
                locale = LocaleId.ENGLISH;
            }


            Tokens tokens = StringTokenizer.tokenizeString(s, locale, "WORD");
            return tokens.size();
        } catch (Exception e) {
            Object[] args = new Object[] { s, bcp47Locale, e };
            log.error("unable to count words in string '{}' for locale '{}'",
                    args);
            return 0;
        }
    }


    /**
     * Extracts plain text from a TMX entry. This ignores the TMX elements that
     * mark up native code sequences: {@code
     * <bpt></bpt>
     * <ept></ept>
     * <it></it>
     * <ph></ph>
     * <seg></seg>}
     *
     * @param content
     *            The tmx marked up content.
     * @return A string with all tmx mark-up content stripped out. Essentially a
     *         plain text version of the string.
     */
    public static String removeFormattingMarkup(/* final */String content) {
        // The content must be a fully formed <seg> element with optional
        // attributes and with no leading/trailing whitespace
        assert content.startsWith("<seg") && content.endsWith("</seg>");


        try {
            XMLInputFactory inputFactory = XMLInputFactory.newFactory();
            inputFactory.setProperty(XMLInputFactory.IS_VALIDATING, false);
            inputFactory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
            XMLEventReader reader =
                    inputFactory.createXMLEventReader(new ByteArrayInputStream(
                            content.getBytes()));
            StringBuilder writer = new StringBuilder();


            int ignoreLevel = 0; // Nesting level. When this is > 0 it means we
                                 // are ignoring events


            while (reader.hasNext()) {
                XMLEvent nextEv = reader.nextEvent();


                switch (nextEv.getEventType()) {
                case XMLStreamConstants.START_ELEMENT:
                    ignoreLevel =
                            handleStartElem(ignoreLevel,
                                    nextEv.asStartElement());
                    break;
                case XMLStreamConstants.END_ELEMENT:
                    ignoreLevel =
                            handleEndElem(ignoreLevel, nextEv.asEndElement());
                    break;
                case XMLStreamConstants.CHARACTERS:
                    if (ignoreLevel == 0)
                        writer.append(nextEv.asCharacters().getData());
                    break;
                }
            }


            return writer.toString();
        } catch (XMLStreamException e) {
            throw new RuntimeException(e);
        }
    }


    private static int handleStartElem(int ignoreLevel, StartElement startElem) {
        String elemName = startElem.getName().getLocalPart();
        if (ignoreElement(elemName)) {
            return ignoreLevel + 1;
        }
        return ignoreLevel;
    }


    private static int handleEndElem(int ignoreLevel, EndElement endElem) {
        String elemName = endElem.getName().getLocalPart();


        if (ignoreElement(elemName)) {
            if (ignoreLevel > 0) {
                return ignoreLevel - 1;
            }
        }
        return ignoreLevel;
    }


    private static boolean ignoreElement(String elemName) {
        // NB we do want the contents of 'hi' elements, but not these elements:
        return elemName.equals("bpt") || elemName.equals("ept")
                || elemName.equals("it") || elemName.equals("ph")
                || elemName.equals("sub");
    }


    private static class StringTokenizer extends Tokenizer {
        public static Tokens tokenizeString(String text, LocaleId language,
                String... tokenNames) {
            synchronized (Tokenizer.class) {
                return Tokenizer.tokenizeString(text, language, tokenNames);
            }
        }
    }


}
Source Code of org.zanata.util.OkapiUtil

Related Classes of org.zanata.util.OkapiUtil