Package org.exist.util

Source Code of org.exist.util.Collations

/*
* eXist Open Source Native XML Database
* Copyright (C) 2004-2009 The eXist Project
* http://exist-db.org
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*  $Id$
*/
package org.exist.util;

import java.net.URI;
import java.net.URISyntaxException;
import java.text.CollationElementIterator;
import java.text.Collator;
import java.text.ParseException;
import java.text.RuleBasedCollator;
import java.util.Comparator;
import java.util.Locale;
import java.util.StringTokenizer;

import org.apache.log4j.Logger;
import org.exist.xquery.Constants;
import org.exist.xquery.ErrorCodes;
import org.exist.xquery.XPathException;
import org.exist.xquery.XQueryContext;

/**
* Utility methods dealing with collations.
*
* @author wolf
*/
public class Collations {

    private final static Logger logger = Logger.getLogger(Collations.class);

    /**
     * The default unicode codepoint collation URI as defined by the XQuery
     * spec.
     */
    //public final static String CODEPOINT = "http://www.w3.org/2004/07/xpath-functions/collation/codepoint";
    public final static String CODEPOINT = "http://www.w3.org/2005/xpath-functions/collation/codepoint";

    /**
     * Short string to select the default codepoint collation
     */
    public final static String CODEPOINT_SHORT = "codepoint";

    /**
     * The URI used to select collations in eXist.
     */
    public final static String EXIST_COLLATION_URI = "http://exist-db.org/collation";

    /**
     * Get a {@link Comparator}from the specified URI.
     *
     * The original code is from saxon (@linkplain http://saxon.sf.net).
     *
     * @param uri
     * @throws XPathException
     */
    public final static Collator getCollationFromURI(XQueryContext context,
            String uri)
        throws XPathException {
        if (uri.startsWith(EXIST_COLLATION_URI) || uri.startsWith("?")) {
            URI u = null;
            try {
                u = new URI(uri);
            } catch (final URISyntaxException e) {
                return null;
            }
            final String query = u.getQuery();
            String strength = null;
            /*
             * Check if the db broker is configured to be case insensitive. If
             * yes, we assume "primary" strength unless the user specified
             * something different.
             *
             * TODO: bad idea: using primary strength as default also ignores
             * German Umlaute.
             */
            // if(!context.getBroker().isCaseSensitive())
            // strength = "primary";
            if (query == null) {
                return getCollationFromParams(null, strength, null);
            } else {
                String lang = null;
                String decomposition = null;
                final StringTokenizer queryTokenizer = new StringTokenizer(query,
                        ";&");
                while (queryTokenizer.hasMoreElements()) {
                    final String param = queryTokenizer.nextToken();
                    final int eq = param.indexOf('=');
                    if (eq > 0) {
                        final String kw = param.substring(0, eq);
                        String val = param.substring(eq + 1);
                        if ("lang".equals(kw)) {
                            lang = val;
                        } else if ("strength".equals(kw)) {
                            strength = val;
                        } else if ("decomposition".equals(kw)) {
                            decomposition = val;
                        }
                    }
                }
                return getCollationFromParams(lang, strength, decomposition);
            }
        } else if (uri.startsWith("java:")) {
            // java class specified: this should be a subclass of
            // java.text.RuleBasedCollator
            uri = uri.substring("java:".length());
            try {
                final Class<?> collatorClass = Class.forName(uri);
                if (!Collator.class.isAssignableFrom(collatorClass)) {
                    logger.error("The specified collator class is not a subclass of java.text.Collator");
                    throw new XPathException(
                            "The specified collator class is not a subclass of java.text.Collator");
                }
                return (Collator) collatorClass.newInstance();
            } catch (final Exception e) {
                logger.error("The specified collator class " + uri + " could not be found");
                throw new XPathException(
                        ErrorCodes.FOCH0002,
                        "The specified collator class " + uri + " could not be found", e);
            }
        } else if (CODEPOINT.equals(uri)) {
          return null;
        } else {
            logger.error("Unknown collation : '" + uri + "'");
            throw new XPathException(ErrorCodes.FOCH0002, "Unknown collation : '" + uri + "'");
        }
    }

    public final static boolean equals(Collator collator, String s1, String s2) {
        if (collator == null)
            {return s1.equals(s2);}
        else
            {return collator.equals(s1, s2);}
    }

    public final static int compare(Collator collator, String s1, String s2) {
        if (collator == null)
            {return s1==null ? (s2==null ? 0 : -1: s1.compareTo(s2);}
        else
            {return collator.compare(s1, s2);}
    }

    public final static boolean startsWith(Collator collator, String s1, String s2) {
        if (collator == null)
            {return s1.startsWith(s2);}
        else {
            final RuleBasedCollator rbc = (RuleBasedCollator) collator;
            final CollationElementIterator i1 = rbc.getCollationElementIterator(s1);
            final CollationElementIterator i2 = rbc.getCollationElementIterator(s2);
            return collationStartsWith(i1, i2);
        }
    }

    public final static boolean endsWith(Collator collator, String s1, String s2) {
        if (collator == null)
            {return s1.endsWith(s2);}
        else {
            final RuleBasedCollator rbc = (RuleBasedCollator) collator;
            final CollationElementIterator i1 = rbc.getCollationElementIterator(s1);
            final CollationElementIterator i2 = rbc.getCollationElementIterator(s2);
            return collationContains(i1, i2, null, true);
        }
    }

    public final static boolean contains(Collator collator, String s1, String s2) {
        if (collator == null)
            {return s1.indexOf(s2) != Constants.STRING_NOT_FOUND;}
        else {
            final RuleBasedCollator rbc = (RuleBasedCollator) collator;
            final CollationElementIterator i1 = rbc.getCollationElementIterator(s1);
            final CollationElementIterator i2 = rbc.getCollationElementIterator(s2);
            return collationContains(i1, i2, null, false);
        }
    }

    public final static int indexOf(Collator collator, String s1, String s2) {
        if (collator == null)
            {return s1.indexOf(s2);}
        else {
            final int offsets[] = new int[2];
            final RuleBasedCollator rbc = (RuleBasedCollator) collator;
            final CollationElementIterator i1 = rbc.getCollationElementIterator(s1);
            final CollationElementIterator i2 = rbc.getCollationElementIterator(s2);          
            if (collationContains(i1, i2, offsets, false))
                {return offsets[0];}
            else
                {return Constants.STRING_NOT_FOUND;}
        }
    }

    private final static boolean collationStartsWith(CollationElementIterator s0,
            CollationElementIterator s1) {
      //Copied from Saxon
        while (true) {
            int e0, e1;
            do {
                e1 = s1.next();
            } while (e1 == 0);
            if (e1 == -1) {
                return true;
            }
            do {
                e0 = s0.next();
            } while (e0 == 0);
            if (e0 != e1) {
                return false;
            }
        }
        //End of copy
    }

    private final static boolean collationContains(CollationElementIterator s0,
            CollationElementIterator s1, int[] offsets, boolean matchAtEnd) {
      //Copy from Saxon
        int e0, e1;
        do {
            e1 = s1.next();
        } while (e1 == 0);
        if (e1 == -1) {
            return true;
        }
        e0 = -1;
        while (true) {
            // scan the first string to find a matching character
            while (e0 != e1) {
                do {
                    e0 = s0.next();
                } while (e0 == 0);
                if (e0 == -1) {
                    // hit the end, no match
                    return false;
                }
            }
            // matched first character, note the position of the possible match
            final int start = s0.getOffset();
            if (collationStartsWith(s0, s1)) {
                if (matchAtEnd) {
                    do {
                        e0 = s0.next();
                    } while (e0 == 0);
                    if (e0 == -1) {
                        // the match is at the end
                        return true;
                    }
                    // else ignore this match and keep looking
                } else {
                    if (offsets != null) {
                        offsets[0] = start-1;
                        offsets[1] = s0.getOffset();
                    }
                    return true;
                }
            }
            // reset the position and try again
            s0.setOffset(start);

            // workaround for a difference between JDK 1.4.0 and JDK 1.4.1
            if (s0.getOffset() != start) {
                // JDK 1.4.0 takes this path
                s0.next();
            }
            s1.reset();
            e0 = -1;
            do {
                e1 = s1.next();
            } while (e1 == 0);
            // loop round to try again
        }
        //End of copy
    }

    /**
     * @param lang
     * @param strength
     * @param decomposition
     * @return The collator
     */
    private static Collator getCollationFromParams(String lang,
            String strength, String decomposition)
        throws XPathException {
        Collator collator = null;
        if (lang == null) {
            collator = Collator.getInstance();
        } else if ("sme-SE".equals(lang)) {
            // Collation rules contained in a String object.
            // Codes for the representation of names of languages:
            // http://www.loc.gov/standards/iso639-2/englangn.html
            // UTF-8 characters from:
            // http://chouette.info/entities/table-utf8.php
            final String Samisk = "< a,A< \u00E1,\u00C1< b,B< c,C"
                    + "< \u010d,\u010c< d,D< \u0111,\u0110< e,E"
                    + "< f,F< g,G< h,H< i,I< j,J< k,K< l,L< m,M"
                    + "< n,N< \u014b,\u014a< o,O< p,P< r,R< s,S"
                    + "< \u0161,\u0160< t,T< \u0167,\u0166< u,U"
                    + "< v,V< z,Z< \u017e,\u017d";
            try {
                collator = new RuleBasedCollator(Samisk);
            } catch (final ParseException pe) {
                return null;
            }
        } else {
            final Locale locale = getLocale(lang);
            collator = Collator.getInstance(locale);
        }

        if (strength != null) {
            if ("primary".equals(strength))
                {collator.setStrength(Collator.PRIMARY);}
            else if ("secondary".equals(strength))
                {collator.setStrength(Collator.SECONDARY);}
            else if ("tertiary".equals(strength))
                {collator.setStrength(Collator.TERTIARY);}
            else if (strength.length() == 0 || "identical".equals(strength))
                // the default setting
                {collator.setStrength(Collator.IDENTICAL);}
            else {
                logger.error("Collation strength should be either 'primary', 'secondary', 'tertiary' or 'identical");
                throw new XPathException(
                        "Collation strength should be either 'primary', 'secondary', 'tertiary' or 'identical");

            }

        }

        if (decomposition != null) {
            if ("none".equals(decomposition))
                {collator.setDecomposition(Collator.NO_DECOMPOSITION);}
            else if ("full".equals(decomposition))
                {collator.setDecomposition(Collator.FULL_DECOMPOSITION);}
            else if (decomposition.length() == 0
                    || "standard".equals(decomposition))
                // the default setting
                {collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);}
            else {
                logger.error("Collation decomposition should be either 'none', 'full' or 'standard");
                throw new XPathException(
                        "Collation decomposition should be either 'none', 'full' or 'standard");
            }

        }

        return collator;
    }

    /**
     * @param lang
     * @return The locale
     */
    private static Locale getLocale(String lang) {
        final int dashPos = lang.indexOf('-');
        if (dashPos == Constants.STRING_NOT_FOUND)
            {return new Locale(lang);}
        else
            {return new Locale(lang.substring(0, dashPos), lang.substring(dashPos + 1));}
    }

}
TOP

Related Classes of org.exist.util.Collations

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.