Package com.asakusafw.compiler.fileio

Source Code of com.asakusafw.compiler.fileio.HadoopFileIoProcessor

/**
* Copyright 2011-2014 Asakusa Framework Team.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.asakusafw.compiler.fileio;

import java.io.IOException;
import java.text.MessageFormat;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;

import com.asakusafw.compiler.flow.ExternalIoDescriptionProcessor;
import com.asakusafw.compiler.flow.FlowCompilerOptions.GenericOptionValue;
import com.asakusafw.compiler.flow.Location;
import com.asakusafw.compiler.flow.jobflow.CompiledStage;
import com.asakusafw.compiler.flow.jobflow.ExternalIoStage;
import com.asakusafw.compiler.flow.mapreduce.copy.CopierClientEmitter;
import com.asakusafw.compiler.flow.mapreduce.copy.CopyDescription;
import com.asakusafw.compiler.flow.mapreduce.parallel.ParallelSortClientEmitter;
import com.asakusafw.compiler.flow.mapreduce.parallel.ResolvedSlot;
import com.asakusafw.compiler.flow.mapreduce.parallel.Slot;
import com.asakusafw.compiler.flow.mapreduce.parallel.SlotResolver;
import com.asakusafw.runtime.stage.input.TemporaryInputFormat;
import com.asakusafw.runtime.stage.output.TemporaryOutputFormat;
import com.asakusafw.utils.collections.Lists;
import com.asakusafw.utils.collections.Maps;
import com.asakusafw.utils.collections.Sets;
import com.asakusafw.vocabulary.external.ExporterDescription;
import com.asakusafw.vocabulary.external.FileExporterDescription;
import com.asakusafw.vocabulary.external.FileImporterDescription;
import com.asakusafw.vocabulary.external.ImporterDescription;
import com.asakusafw.vocabulary.flow.graph.InputDescription;
import com.asakusafw.vocabulary.flow.graph.OutputDescription;

/**
* ファイルの入出力を処理する。
* @since 0.1.0
*/
public class HadoopFileIoProcessor extends ExternalIoDescriptionProcessor {

    private static final Pattern VALID_OUTPUT_NAME = Pattern.compile("[0-9A-Za-z]+");

    private static final String MODULE_NAME = "fileio";

    /**
     * The option name for {@link FileExporterDescription} is enabled.
     */
    public static final String OPTION_EXPORTER_ENABLED = "MAPREDUCE-370";

    private static final GenericOptionValue DEFAULT_EXPORTER_ENABLED = GenericOptionValue.AUTO;

    @Override
    public String getId() {
        return MODULE_NAME;
    }

    @Override
    public Class<? extends ImporterDescription> getImporterDescriptionType() {
        return FileImporterDescription.class;
    }

    @Override
    public Class<? extends ExporterDescription> getExporterDescriptionType() {
        return FileExporterDescription.class;
    }

    @Override
    public boolean validate(List<InputDescription> inputs, List<OutputDescription> outputs) {
        boolean valid = validateOutputs(outputs);
        return valid;
    }

    private boolean validateOutputs(List<OutputDescription> outputs) {
        assert outputs != null;
        boolean valid = true;
        GenericOptionValue exporterEnabled = getEnvironment().getOptions().getGenericExtraAttribute(
                OPTION_EXPORTER_ENABLED,
                DEFAULT_EXPORTER_ENABLED);
        if (exporterEnabled == GenericOptionValue.INVALID) {
            getEnvironment().error(
                    "Invalid valud for compiler option \"{0}\" ({1}), this must be {2}",
                    getEnvironment().getOptions().getExtraAttributeKeyName(OPTION_EXPORTER_ENABLED),
                    getEnvironment().getOptions().getExtraAttribute(OPTION_EXPORTER_ENABLED),
                    GenericOptionValue.ENABLED.getSymbol() + "|" + GenericOptionValue.DISABLED.getSymbol());
            exporterEnabled = DEFAULT_EXPORTER_ENABLED;
            valid = false;
        }
        boolean mr370applied = checkClassExists("org.apache.hadoop.mapreduce.lib.output.MultipleOutputs");
        for (OutputDescription output : outputs) {
            FileExporterDescription desc = extract(output);
            if (exporterEnabled == GenericOptionValue.DISABLED) {
                valid = false;
                getEnvironment().error(
                        "出力{0}を利用するにはコンパイルオプション\"{1}={2}\"の指定が必要です",
                        desc.getClass().getName(),
                        getEnvironment().getOptions().getExtraAttributeKeyName(OPTION_EXPORTER_ENABLED),
                        GenericOptionValue.ENABLED.getSymbol());
            } else if (mr370applied == false && exporterEnabled == GenericOptionValue.AUTO) {
                valid = false;
                getEnvironment().error(
                        "現在のディストリビューションは{1}に対応していません。"
                        + "別のディストリビューションを利用するか、{2}に置き換えてください (出力{0})。",
                        desc.getClass().getName(),
                        FileExporterDescription.class.getSimpleName(),
                        "DirectFileOutputDescription (directio)");
            }
            String pathPrefix = desc.getPathPrefix();
            if (pathPrefix == null) {
                valid = false;
                getEnvironment().error(
                        "{0}のパスが指定されていません",
                        desc.getClass().getName());
            } else {
                Location location = Location.fromPath(pathPrefix, '/');
                if (location.isPrefix() == false) {
                    valid = false;
                    getEnvironment().error(
                            "{0}はパスの接尾辞(-*)でなければなりません: {1}",
                            desc.getClass().getName(),
                            pathPrefix);
                }
                if (location.getParent() == null) {
                    valid = false;
                    getEnvironment().error(
                            "{0}には最低ひとつのディレクトリの指定が必要です: {1}",
                            desc.getClass().getName(),
                            pathPrefix);
                }
                if (VALID_OUTPUT_NAME.matcher(location.getName()).matches() == false) {
                    valid = false;
                    getEnvironment().error(
                            "{0}のファイル名(末尾のセグメント)は英数字のみ利用できます: {1}",
                            desc.getClass().getName(),
                            pathPrefix);
                }
            }
        }
        return valid;
    }
    private boolean checkClassExists(String className) {
        try {
            Class.forName(className);
            return true;
        } catch (ClassNotFoundException e) {
            return false;
        }
    }

    @Override
    public SourceInfo getInputInfo(InputDescription description) {
        FileImporterDescription desc = extract(description);
        if (isCacheTarget(desc)) {
            String outputName = getProcessedInputName(description);
            Location location = getEnvironment().getPrologueLocation(MODULE_NAME).append(outputName).asPrefix();
            return new SourceInfo(Collections.singleton(location), TemporaryInputFormat.class);
        } else {
            return getOrifinalInputInfo(desc);
        }
    }

    private SourceInfo getOrifinalInputInfo(FileImporterDescription desc) {
        assert desc != null;
        Set<Location> locations = Sets.create();
        for (String path : desc.getPaths()) {
            locations.add(Location.fromPath(path, '/'));
        }
        return new SourceInfo(locations, desc.getInputFormat());
    }

    private boolean isCacheTarget(ImporterDescription desc) {
        assert desc != null;
        switch (desc.getDataSize()) {
        case TINY:
            return getEnvironment().getOptions().isHashJoinForTiny();
        case SMALL:
            return getEnvironment().getOptions().isHashJoinForSmall();
        default:
            return false;
        }
    }

    private String getProcessedInputName(InputDescription description) {
        assert description != null;
        StringBuilder buf = new StringBuilder();
        for (char c : description.getName().toCharArray()) {
            // 0 as escape character
            if ('1' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z') {
                buf.append(c);
            } else if (c <= 0xff) {
                buf.append('0');
                buf.append(String.format("%02x", (int) c));
            } else {
                buf.append("0u");
                buf.append(String.format("%04x", (int) c));
            }
        }
        return buf.toString();
    }

    @Override
    public List<ExternalIoStage> emitPrologue(IoContext context) throws IOException {
        List<CopyDescription> targets = Lists.create();
        for (Input input : context.getInputs()) {
            InputDescription description = input.getDescription();
            FileImporterDescription desc = extract(description);
            if (isCacheTarget(desc)) {
                targets.add(new CopyDescription(
                        getProcessedInputName(description),
                        getEnvironment().getDataClasses().load(description.getDataType()),
                        getOrifinalInputInfo(desc),
                        TemporaryOutputFormat.class));
            }
        }
        if (targets.isEmpty()) {
            return Collections.emptyList();
        }
        CopierClientEmitter emitter = new CopierClientEmitter(getEnvironment());
        CompiledStage stage = emitter.emitPrologue(
                MODULE_NAME,
                targets,
                getEnvironment().getPrologueLocation(MODULE_NAME));
        return Collections.singletonList(new ExternalIoStage(getId(), stage, context.getInputContext()));
    }

    @Override
    public List<ExternalIoStage> emitEpilogue(IoContext context) throws IOException {
        Set<String> saw = Sets.create();
        List<ExternalIoStage> results = Lists.create();
        for (Map.Entry<Location, List<Slot>> entry : groupByOutputLocation(context).entrySet()) {
            List<Slot> slots = entry.getValue();
            List<ResolvedSlot> resolved = new SlotResolver(getEnvironment()).resolve(slots);
            if (getEnvironment().hasError()) {
                return Collections.emptyList();
            }
            ParallelSortClientEmitter emitter = new ParallelSortClientEmitter(getEnvironment());
            String moduleId = generateModuleName(saw, entry.getKey());
            CompiledStage stage = emitter.emit(moduleId, resolved, entry.getKey());
            // TODO not sure
            results.add(new ExternalIoStage(getId(), stage, context.getOutputContext()));
        }
        return results;
    }

    private String generateModuleName(Set<String> saw, Location target) {
        assert saw != null;
        assert target != null;
        String simpleSuffix = generateSuffix(target);
        String baseModuleId = MessageFormat.format("{0}.{1}", MODULE_NAME, simpleSuffix);
        if (saw.contains(baseModuleId) == false) {
            saw.add(baseModuleId);
            return baseModuleId;
        }
        int index = 1;
        while (true) {
            String moduleIdCandidate = baseModuleId + index;
            if (saw.contains(moduleIdCandidate) == false) {
                saw.add(moduleIdCandidate);
                return moduleIdCandidate;
            }
            index++;
        }
    }

    private String generateSuffix(Location target) {
        assert target != null;
        String name = target.getName();
        if (name.isEmpty()) {
            return "_";
        }
        StringBuilder buf = new StringBuilder();
        if (Character.isJavaIdentifierStart(name.charAt(0)) == false) {
            buf.append('_');
        }
        for (char c : name.toCharArray()) {
            if (Character.isJavaIdentifierPart(c)) {
                buf.append(c);
            }
        }
        assert buf.length() >= 1;
        return buf.toString();
    }

    private Map<Location, List<Slot>> groupByOutputLocation(IoContext context) {
        assert context != null;
        Map<Location, List<Slot>> results = new TreeMap<Location, List<Slot>>(new Comparator<Location>() {
            @Override
            public int compare(Location o1, Location o2) {
                // o1.parent が o2.parent の祖先パスである場合に、o1がo2より手前に来るように並び替える。
                // これは、Hadoopの出力先にディレクトリが既に存在する場合にエラーとするため。
                // 逆もまた然り。

                // 親パスを文字列で比較
                // AがBの祖先パス => A.toString < B.toString
                // と言う関係をもとに、親パスの文字列が異なればその順序で整列
                String parentPath1 = (o1.getParent() == null) ? "" : o1.getParent().toPath('/');
                String parentPath2 = (o2.getParent() == null) ? "" : o2.getParent().toPath('/');
                int parentDiff = parentPath1.compareTo(parentPath2);
                if (parentDiff != 0) {
                    return (parentDiff > 0) ? +1 : -1;
                }

                // 親パスまでが同じなので名前のみ比較
                return o1.getName().compareTo(o2.getName());
            }
        });
        for (Output output : context.getOutputs()) {
            FileExporterDescription desc = extract(output.getDescription());
            Location path = Location.fromPath(desc.getPathPrefix(), '/');
            Location parent = path.getParent();
            Maps.addToList(results, parent, toSlot(output, path.getName()));
        }
        return results;
    }

    private Slot toSlot(Output output, String name) {
        assert output != null;
        assert name != null;
        return new Slot(
                name,
                output.getDescription().getDataType(),
                Collections.<String>emptyList(),
                output.getSources(),
                extract(output.getDescription()).getOutputFormat());
    }

    private FileImporterDescription extract(InputDescription description) {
        assert description != null;
        ImporterDescription importer = description.getImporterDescription();
        assert importer != null;
        assert importer instanceof FileImporterDescription;
        return (FileImporterDescription) importer;
    }

    private FileExporterDescription extract(OutputDescription description) {
        assert description != null;
        ExporterDescription exporter = description.getExporterDescription();
        assert exporter != null;
        assert exporter instanceof FileExporterDescription;
        return (FileExporterDescription) exporter;
    }
}
TOP

Related Classes of com.asakusafw.compiler.fileio.HadoopFileIoProcessor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.