// Source Code of org.zanata.adapter.OkapiFilterAdapter

/*
 * Copyright 2012, Red Hat, Inc. and individual contributors
 * as indicated by the @author tags. See the copyright.txt file in the
 * distribution for a full listing of individual contributors.
 *
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
 */
package org.zanata.adapter;


import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import java.util.HashMap;
import java.util.List;
import java.util.Map;


import net.sf.okapi.common.Event;
import net.sf.okapi.common.EventType;
import net.sf.okapi.common.IParameters;
import net.sf.okapi.common.exceptions.OkapiIOException;
import net.sf.okapi.common.filters.IFilter;
import net.sf.okapi.common.filterwriter.GenericContent;
import net.sf.okapi.common.filterwriter.IFilterWriter;
import net.sf.okapi.common.resource.RawDocument;
import net.sf.okapi.common.resource.StartSubDocument;
import net.sf.okapi.common.resource.TextUnit;


import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.zanata.common.ContentState;
import org.zanata.common.ContentType;
import org.zanata.common.HasContents;
import org.zanata.common.LocaleId;
import org.zanata.exception.FileFormatAdapterException;
import org.zanata.rest.dto.resource.Resource;
import org.zanata.rest.dto.resource.TextFlow;
import org.zanata.rest.dto.resource.TextFlowTarget;
import org.zanata.rest.dto.resource.TranslationsResource;
import org.zanata.util.HashUtil;


import com.google.common.base.Optional;


/**
 * An adapter that uses a provided {@link IFilter} implementation to parse
 * documents.
 *
 * @author David Mason, <a
 *         href="mailto:damason@redhat.com">damason@redhat.com</a>
 *
 */
public class OkapiFilterAdapter implements FileFormatAdapter {
    // SLF4J logger for this adapter; assigned in the four-argument constructor.
    private Logger log;


    /**
     * Determines how TextFlow ids are assigned for Okapi TextUnits
     */
    public enum IdSource {
        /** use ID of TextUnit as is (only unique if no sub-documents) */
        textUnitId,
        /**
         * use 'name' attribute of TextUnit, if any. Not guaranteed to be
         * unique.
         */
        textUnitName,
        /**
         * use a hash of string content (similar to resId for gettext projects).
         * Not guaranteed to be unique.
         */
        contentHash,
        /**
         * concatenate name of sub-document and ID of TextUnit. Should be unique
         * (assuming sub-document names are).
         */
        subDocNameAndTextUnitId
    };


    // Okapi filter used for all parsing and for creating the output writer.
    private final IFilter filter;
    // Strategy consulted by getIdFor when assigning TextFlow ids.
    private final IdSource idSource;
    // When true, translated output goes through a temp file instead of
    // being written directly to the output stream.
    private boolean requireFileOutput;
    // When true, only the "str" partition produced by TranslatableSeparator
    // is treated as translatable text.
    private boolean separateNonTranslatable;


    /**
     * Create an adapter that will use the specified {@link IdSource} as
     * TextFlow id.
     *
     * @param filter
     *            {@link IFilter} used to parse the document
     * @param idSource
     *            determines how ids are assigned to TextFlows. The chosen
     *            source should only produce duplicate ids when source content
     *            is identical.
     */
    public OkapiFilterAdapter(IFilter filter, IdSource idSource) {
        this(filter, idSource, false);
    }


    /**
     * Create an adapter that will use the specified {@link IdSource} as
     * TextFlow id, without separating non-translatable text.
     *
     * @param filter
     *            {@link IFilter} used to parse the document
     * @param idSource
     *            determines how ids are assigned to TextFlows. The chosen
     *            source should only produce duplicate ids when source content
     *            is identical.
     * @param requireFileOutput
     *            true if filter requires a file on disk rather than just a
     *            stream. Causes a temp file to be created when writing a
     *            translated file.
     */
    public OkapiFilterAdapter(IFilter filter, IdSource idSource,
            boolean requireFileOutput) {
        this(filter, idSource, requireFileOutput, false);
    }


    public OkapiFilterAdapter(IFilter filter, IdSource idSource,
            boolean requireFileOutput, boolean separateNonTranslatable) {
        this.filter = filter;
        this.idSource = idSource;
        this.requireFileOutput = requireFileOutput;
        this.separateNonTranslatable = separateNonTranslatable;


        log = LoggerFactory.getLogger(OkapiFilterAdapter.class);
    }


    @Override
    public Resource parseDocumentFile(URI documentContent,
            LocaleId sourceLocale, Optional<String> filterParams)
            throws FileFormatAdapterException, IllegalArgumentException {
        // null documentContent is handled by RawDocument constructor
        if (sourceLocale == null) {
            throw new IllegalArgumentException("Source locale cannot be null");
        }


        Resource document = new Resource();
        document.setLang(sourceLocale);
        document.setContentType(ContentType.TextPlain);


        List<TextFlow> resources = document.getTextFlows();
        Map<String, HasContents> addedResources =
                new HashMap<String, HasContents>();


        RawDocument rawDoc =
                new RawDocument(documentContent, "UTF-8",
                        net.sf.okapi.common.LocaleId.fromString("en"));
        updateParams(filterParams);
        try {
            filter.open(rawDoc);
            String subDocName = "";
            while (filter.hasNext()) {
                Event event = filter.next();
                if (event.getEventType() == EventType.START_SUBDOCUMENT) {
                    StartSubDocument startSubDoc =
                            (StartSubDocument) event.getResource();
                    subDocName = stripPath(startSubDoc.getName());
                } else if (event.getEventType() == EventType.TEXT_UNIT) {
                    TextUnit tu = (TextUnit) event.getResource();
                    if (!tu.getSource().isEmpty() && tu.isTranslatable()) {
                        String content = getTranslatableText(tu);
                        if (!content.isEmpty()) {
                            TextFlow tf =
                                    new TextFlow(getIdFor(tu, content,
                                            subDocName), sourceLocale);
                            tf.setPlural(false);
                            tf.setContents(content);
                            if (shouldAdd(tf.getId(), tf, addedResources)) {
                                addedResources.put(tf.getId(), tf);
                                resources.add(tf);
                            }
                        }
                    }
                }
            }
        } catch (OkapiIOException e) {
            throw new FileFormatAdapterException("Unable to parse document", e);
        } finally {
            filter.close();
        }
        return document;
    }


    private String getTranslatableText(TextUnit tu) {
        String letterCodedText =
                GenericContent.fromFragmentToLetterCoded(tu.getSource()
                        .getFirstContent(), true);
        if (separateNonTranslatable) {
            return getPartitionedText(letterCodedText).get("str");
        } else {
            return letterCodedText;
        }
    }


    /**
     * Separates translatable text from surrounding non-translatable text.
     *
     * @param tu
     * @return
     */
    private Map<String, String> getPartitionedText(TextUnit tu) {
        return TranslatableSeparator.separate(GenericContent
                .fromFragmentToLetterCoded(tu.getSource().getFirstContent(),
                        true));
    }


    /**
     * Separates translatable text from surrounding non-translatable text.
     *
     * @param letterCodedText
     *            letter-coded text, passed unchanged to
     *            TranslatableSeparator.separate
     * @return the map produced by TranslatableSeparator.separate; this class
     *         reads its "pre", "str" and "suf" entries
     */
    private Map<String, String> getPartitionedText(String letterCodedText) {
        return TranslatableSeparator.separate(letterCodedText);
    }


    /**
     * Check whether a TextFlow or TextFlowTarget should be added given the
     * current rules and state.
     *
     * @param id
     *            of the source string
     * @param hc
     *            the TextFlow or TextFlowTarget to add
     * @param addedResources
     *            record of the strings that have been added so far.
     * @return true if a string with the same id does not exist in
     *         addedResources
     * @throws FileFormatAdapterException
     *             if a duplicate is found when elideDuplicates is false, or if
     *             duplicates do not have identical contents.
     */
    private boolean shouldAdd(String id, HasContents hc,
            Map<String, HasContents> addedResources)
            throws FileFormatAdapterException {
        if (addedResources.containsKey(id)) {
            if (!hc.getContents().equals(addedResources.get(id).getContents())) {
                throw new FileFormatAdapterException(
                        "Same id but different contents for text text flow, "
                                + "not suitable for eliding.");
            }
            return false;
        }
        return true;
    }


    private String stripPath(String name) {
        if (name.contains("/") && !name.endsWith("/")) {
            return name.substring(name.lastIndexOf('/') + 1);
        } else {
            return name;
        }
    }


    @Override
    public TranslationsResource parseTranslationFile(URI fileUri,
            String localeId, Optional<String> filterParams)
            throws FileFormatAdapterException, IllegalArgumentException {
        if (localeId == null || localeId.isEmpty()) {
            throw new IllegalArgumentException(
                    "locale id string cannot be null or empty");
        }


        RawDocument rawDoc =
                new RawDocument(fileUri, "UTF-8",
                        net.sf.okapi.common.LocaleId.fromString("en"));
        return parseTranslationFile(rawDoc, filterParams);
    }


    private TranslationsResource parseTranslationFile(RawDocument rawDoc,
            Optional<String> params) {
        TranslationsResource transRes = new TranslationsResource();
        List<TextFlowTarget> translations = transRes.getTextFlowTargets();


        Map<String, HasContents> addedResources =
                new HashMap<String, HasContents>();
        updateParams(params);
        try {
            filter.open(rawDoc);
            String subDocName = "";
            while (filter.hasNext()) {
                Event event = filter.next();
                if (event.getEventType() == EventType.START_SUBDOCUMENT) {
                    StartSubDocument startSubDoc =
                            (StartSubDocument) event.getResource();
                    subDocName = stripPath(startSubDoc.getName());
                } else if (event.getEventType() == EventType.TEXT_UNIT) {
                    TextUnit tu = (TextUnit) event.getResource();
                    if (!tu.getSource().isEmpty() && tu.isTranslatable()) {
                        String content = getTranslatableText(tu);
                        if (!content.isEmpty()) {
                            TextFlowTarget tft =
                                    new TextFlowTarget(getIdFor(tu, content, subDocName));
                            tft.setContents(content);
                            tft.setState(ContentState.NeedReview);
                            if (shouldAdd(tft.getResId(), tft, addedResources)) {
                                addedResources.put(tft.getResId(), tft);
                                translations.add(tft);
                            }


                        }
                    }
                }
            }
        } catch (OkapiIOException e) {
            throw new FileFormatAdapterException(
                    "Unable to parse translation file", e);
        } finally {
            filter.close();
        }
        return transRes;
    }


    @Override
    public void writeTranslatedFile(OutputStream output, URI originalFile,
            Map<String, TextFlowTarget> translations, String locale,
            Optional<String> params) throws FileFormatAdapterException,
            IllegalArgumentException {
        net.sf.okapi.common.LocaleId localeId =
                net.sf.okapi.common.LocaleId.fromString(locale);
        IFilterWriter writer = filter.createFilterWriter();
        writer.setOptions(localeId, getOutputEncoding());


        if (requireFileOutput) {
            writeTranslatedFileWithFileOutput(output, originalFile,
                    translations, localeId, writer, params);
        } else {
            writer.setOutput(output);
            generateTranslatedFile(originalFile, translations, localeId,
                    writer, params);
        }
    }


    /**
     * Encoding name passed to the filter writer's options when writing a
     * translated file. Protected, so subclasses may override it.
     *
     * @return "UTF-8"
     */
    protected String getOutputEncoding() {
        return "UTF-8";
    }


    private void writeTranslatedFileWithFileOutput(OutputStream output,
            URI originalFile, Map<String, TextFlowTarget> translations,
            net.sf.okapi.common.LocaleId localeId, IFilterWriter writer,
            Optional<String> params) {
        File tempFile = null;


        try {
            tempFile = File.createTempFile("filename", "extension");
            writer.setOutput(tempFile.getCanonicalPath());
            generateTranslatedFile(originalFile, translations, localeId,
                    writer, params);


            byte[] buffer = new byte[4096]; // To hold file contents
            int bytesRead;
            FileInputStream input = new FileInputStream(tempFile);
            while ((bytesRead = input.read(buffer)) != -1) {
                output.write(buffer, 0, bytesRead);
            }
        } catch (IOException e) {
            // FIXME log
            throw new FileFormatAdapterException(
                    "Unable to generate translated file", e);
        } catch (SecurityException e) {
            // FIXME log
            throw new FileFormatAdapterException(
                    "Unable to generate translated file", e);
        } finally {
            if (tempFile != null) {
                if (!tempFile.delete()) {
                    log.warn(
                            "unable to remove temporary file {}, marked for delete on exit",
                            tempFile.getAbsolutePath());
                    tempFile.deleteOnExit();
                }
            }
        }


    }


    private void generateTranslatedFile(URI originalFile,
            Map<String, TextFlowTarget> translations,
            net.sf.okapi.common.LocaleId localeId, IFilterWriter writer,
            Optional<String> params) {
        RawDocument rawDoc =
                new RawDocument(originalFile, "UTF-8",
                        net.sf.okapi.common.LocaleId.fromString("en"));
        updateParams(params);
        try {
            filter.open(rawDoc);
            String subDocName = "";
            while (filter.hasNext()) {
                Event event = filter.next();
                if (event.getEventType() == EventType.START_SUBDOCUMENT) {
                    StartSubDocument startSubDoc =
                            (StartSubDocument) event.getResource();
                    subDocName = stripPath(startSubDoc.getName());
                } else if (event.getEventType() == EventType.TEXT_UNIT) {
                    TextUnit tu = (TextUnit) event.getResource();
                    if (!tu.getSource().isEmpty() && tu.isTranslatable()) {
                        String translatable = getTranslatableText(tu);


                        if (!translatable.isEmpty()) {
                            TextFlowTarget tft =
                                    translations.get(getIdFor(tu,
                                            translatable, subDocName));


                            if (tft != null) {
                                String translated = tft.getContents().get(0);


                                translated =
                                        getFullTranslationText(tu, translated);
                                tu.setTargetContent(localeId, GenericContent
                                        .fromLetterCodedToFragment(
                                                translated, tu.getSource()
                                                .getFirstContent()
                                                .clone(), true, true));
                            }
                        }
                    }


                }
                writer.handleEvent(event);
            }
        } catch (OkapiIOException e) {
            throw new FileFormatAdapterException(
                    "Unable to generate translated document from original", e);
        } finally {
            filter.close();
            writer.close();
        }
    }


    private String getFullTranslationText(TextUnit tu, String translated) {
        if (separateNonTranslatable) {
            Map<String, String> partitionedContent = getPartitionedText(tu);
            return partitionedContent.get("pre") + translated
                    + partitionedContent.get("suf");
        } else {
            return translated;
        }
    }


    /**
     * Return the id for a TextUnit based on id assignment rules. This method
     * can be overridden for more complex id assignment.
     *
     * @param tu
     *            for which to get id
     * @return the id for the given tu
     */
    protected String getIdFor(TextUnit tu, String subDocName) {
        return getIdFor(tu, tu.getSource().toString(), subDocName);
    }


    protected String getIdFor(TextUnit tu, String content, String subDocName) {
        switch (idSource) {
        case contentHash:
            return HashUtil.generateHash(content);
        case textUnitName:
            return tu.getName();
        case subDocNameAndTextUnitId:
            return subDocName + ":" + tu.getId();
        case textUnitId:
        default:
            return tu.getId();
        }
    }


    private void updateParams(Optional<String> params) {
        filter.getParameters().reset();
        updateParamsWithDefaults(filter.getParameters());
        if (params.isPresent()) {
            filter.getParameters().fromString(params.get());
        }
    }


    /**
     * Hook for subclasses to set default filter parameters. Called by
     * updateParams after the parameters are reset and before any supplied
     * parameter string is applied.
     *
     * @param params
     *            the filter's parameters
     */
    protected void updateParamsWithDefaults(IParameters params) {
        // default empty implementation is provided so that subclasses are not
        // forced to override when defaults are not needed.
    }


}
// Source Code of org.zanata.adapter.OkapiFilterAdapter

// Related Classes of org.zanata.adapter.OkapiFilterAdapter