// Package winterwell.utils
//
// Source Code of winterwell.utils.StrUtils

/**
*
*/
package winterwell.utils;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import winterwell.utils.containers.ArrayMap;
import winterwell.utils.containers.Pair;
import winterwell.utils.io.FileUtils;
import winterwell.utils.reporting.Log;
import winterwell.utils.reporting.Log.KErrorPolicy;
import winterwell.utils.web.WebUtils;

/**
* @see WebUtils for XML-related String handling
* @testedby {@link StrUtilsTest}
*/
public final class StrUtils {

  /**
   * Characters treated as apostrophes.
   * {@link #normalise(String, Log.KErrorPolicy)} replaces any of them with a
   * plain ASCII '.
   */
  public static final String APOSTROPHES = "'`’‘’ʼ";

  /**
   * An empty String array. For use in {@link Collection#toArray(Object[])}
   * to get String[] arrays.
   */
  public static final String[] ARRAY = new String[0];

  /**
   * Regex character set for ascii punctuation. A bit like \\W, but includes _
   * and doesn't include non-Latin characters (\\W barfs on foreign character
   * sets).
   */
  public static final Pattern ASCII_PUNCTUATION = Pattern
      .compile("[.<>,@~\\{\\}\\[\\]-_+=()*%?^$!\\\\/|¬:;#`'\"]");

  /**
   * Matches (in multi-line mode) a run of one or more whitespace characters
   * that fills a line. Used by {@link #toCleanLinux(String)}.
   */
  static final Pattern BLANK_LINE = Pattern.compile("^\\s+$",
      Pattern.MULTILINE);

  /**
   * Some commonly used bullet characters (for trying to spot lists, and
   * perhaps converting them into a standard format)
   */
  public static final String COMMON_BULLETS = "-*o";

  /**
   * Dash characters, including the humble -.
   * {@link #normalise(String, Log.KErrorPolicy)} replaces any of them with a
   * plain ASCII -.
   */
  private static final String DASHES = "‐‑‒–—―-";

  /** Charset name for ISO Latin 1. */
  public static final String ENCODING_STD_ISO_LATIN = "ISO-8859-1";

  /** Charset name for UTF-8 (used when hashing text to bytes). */
  public static final String ENCODING_UTF8 = "UTF8";

  /**
   * The line separator to use when writing text. Taken from the
   * "line.separator" system property; presumably Utils.or falls back to "\n"
   * when that property is unset -- confirm against Utils.or.
   */
  public static final String LINEEND = Utils.or(
      System.getProperty("line.separator"), "\n");

  /** Matches one line ending in Windows (\r\n), old Mac (\r) or Unix (\n) form. */
  public static final Pattern LINEENDINGS = Pattern.compile("(\r\n|\r|\n)");
  /** A paragraph of random latin, used for page layout testing since 1500s. */
  public static final String LOREM_IPSUM = "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.";

  /**
   * Characters treated as double-quotes.
   * {@link #normalise(String, Log.KErrorPolicy)} replaces any of them with a
   * plain ASCII ".
   */
  public static final String QUOTES = "\"“”„‟❛❜❝❞«»";

  /**
   * Table of powers of ten: TENS[i] is (approximately, being built by
   * repeated multiplication) 10^(i-6), for i = 0..19.
   * NOTE(review): no method in this class reads this table -- confirm
   * whether it is still needed.
   */
  private static final double[] TENS = new double[20];

  static {
    // Start at 10^-6 and multiply up by ten for each successive slot.
    TENS[0] = Math.pow(10, -6);
    for (int i = 1; i < TENS.length; i++) {
      TENS[i] = 10 * TENS[i - 1];
    }
  }

  /**
   * Convenience for peeking at a character which might be beyond the end of
   * the sequence.
   *
   * @param chars
   * @param i
   * @return the char at index i, if within range, or 0 otherwise.
   */
  public static char charAt(CharSequence chars, int i) {
    return i < chars.length() ? chars.charAt(i) : 0;
  }

  /**
   * Trim and compress all whitespace into single spaces. Also removes
   * whitespace between xml tags.
   *
   * @param txt
   *            Can be null (which will return null)
   * @return a string whose only whitespace is single spaces
   * @see #toCanonical(String)
   */
  public static String compactWhitespace(String txt) {
    if (txt == null)
      return null;
    txt = txt.trim();
    txt = txt.replaceAll("\\s+", " ");
    txt = txt.replaceAll("> <", "><");
    return txt;
  }

  /**
   * @param pageTitle
   * @param snippet
   * @return true if text contains snippet ignoring all capitalisation
   */
  public static boolean containsIgnoreCase(CharSequence pageTitle,
      String snippet) {
    // TODO more efficient -- avoid the copy(s)?
    String pt = pageTitle.toString().toLowerCase();
    return pt.contains(snippet.toLowerCase());
  }

  /**
   * Convert a block of text into a Java String constant
   *
   * @param args
   * @throws IOException
   */
  private static String convertToJavaString(String txt) {
    String[] lines = splitLines(txt);
    String jtxt = "";
    for (String line : lines) {
      line = line.replace("\\", "\\\\");
      line = line.replace("\"", "\\\"");
      jtxt += "+\"" + line + "\\n\"\n";
    }
    jtxt = jtxt.substring(1);
    return jtxt;
  }

  /**
   * Truncate a string adding ellipses if necessary. Does a simple test for
   * word boundary for more elegant start of ellipses
   *
   * @param input
   *            Can be null (returns null)
   * @return a string which is maxLength chars or less
   * @testedby {@link StrUtilsTest#testEllipsize()}
   */
  public static String ellipsize(String input, int maxLength) {
    if (input == null)
      return null;
    if (input.length() <= maxLength)
      return input;
    if (maxLength < 3)
      return "";
    if (maxLength == 3)
      return "...";
    // simple word boundary detection for nicer strings
    int i = input.lastIndexOf(' ', maxLength - 3);
    if (i < 1 || i < maxLength - 10) {
      i = maxLength - 3;
    }
    return substring(input, 0, i) + "...";
  }

  /**
   * Identical to {@link #newLine(StringBuilder)}: simply delegates to it.
   *
   * @param text
   *            The builder to finish the current line of.
   */
  public static void endLine(StringBuilder text) {
    newLine(text);
  }

  /**
   * Extract headers of the form: key: value
   *
   * Ended by a blank line
   * <p>
   * A line is split at its first ':'; the key is everything before it, the
   * value everything after (less one optional leading space). A line with no
   * ':' is treated as a continuation of the current value and is appended
   * to it after a {@link #LINEEND}.
   *
   * @param txt
   *            Will be modified to the text minus the header. Must not be
   *            null. Left untouched if no headers were found.
   * @return The headers. <i>Keys will be converted to lower-case</i> but can
   *         contain spaces. Never null; may be empty.
   */
  public static Map<String, String> extractHeader(StringBuilder txt) {
    assert txt != null;
    String[] lines = StrUtils.splitLines(txt.toString());
    int cnt = 0;
    // The header currently being collected (null until the first key line)
    String key = null;
    StringBuilder value = new StringBuilder();
    Map<String, String> headers = new ArrayMap<String, String>();
    for (; cnt < lines.length; cnt++) {
      String line = lines[cnt];
      // End of header section?
      if (Utils.isBlank(line)) {
        break;
      }
      int i = line.indexOf(":");
      if (i == -1) {
        // Collect bits of a long value
        // NB: if no key has been seen yet, this text is discarded when the
        // first key line resets value.
        value.append(LINEEND);
        value.append(line);
        continue;
      }
      // Old key
      if (key != null) {
        headers.put(key, value.toString());
      }
      // New key
      value = new StringBuilder();
      key = line.substring(0, i).toLowerCase();
      // step past the ':'
      i++;
      if (i == line.length()) {
        // "key:" with nothing after it -- the value stays empty (for now)
        continue;
      }
      // skip a single space after the colon
      if (line.charAt(i) == ' ') {
        i++;
      }
      if (i == line.length()) {
        continue;
      }
      value.append(line.substring(i));
    }
    // Final key-value pair
    if (key != null) {
      headers.put(key, value.toString());
    }
    // Strip off header
    if (headers.size() == 0)
      return headers;
    // Find the first whitespace-only line and cut everything up to it.
    // NOTE(review): if the text has no blank line after the headers, find()
    // looks like it will fail and txt is then left unmodified -- confirm
    // that this is intended.
    Pattern blankLine = Pattern.compile("^\\s*$", Pattern.MULTILINE);
    Matcher m = blankLine.matcher(txt);
    boolean ok = m.find();
    if (ok) {
      txt.delete(0, m.end());
      if (txt.length() != 0) {
        // Also remove the line-break which ended the blank line: 2 chars
        // for \r\n, otherwise 1.
        // NOTE(review): charAt(1) assumes length >= 2 whenever the first
        // char is \r.
        if (txt.charAt(0) == '\r' && txt.charAt(1) == '\n') {
          txt.delete(0, 2);
        } else {
          txt.delete(0, 1);
        }
      }
    }
    return headers;
  }

  /**
   * Convenience for using regexs. Find the first instance of pattern in
   * input.
   *
   * @param pattern
   * @param input
   * @return the matched groups (0 is the whole match), or null
   */
  public static String[] find(Pattern pattern, String input) {
    Matcher m = pattern.matcher(input);
    boolean fnd = m.find();
    if (!fnd)
      return null;
    int n = m.groupCount() + 1;
    String[] grps = new String[n];
    grps[0] = m.group();
    for (int i = 1; i < n; i++) {
      grps[i] = m.group(i);
    }
    return grps;
  }

  //
  // public static Map<String,String> getStrings(InputStream in) {
  // String txt = FileUtils.read(in);
  // String[] lines = splitLines(txt);
  // Map<String, String> map = new HashMap<String, String>();
  // return map;
  // }

  /**
   * Convenience method for {@link #find(Pattern, String)}: compiles regex
   * (afresh on every call) and delegates.
   *
   * @param regex
   *            The regular expression to look for.
   * @param string
   *            The text to search.
   * @return the matched groups (0 is the whole match), or null
   */
  public static String[] find(String regex, String string) {
    return find(Pattern.compile(regex), string);
  }

  /**
   * Find the position of content within text -- ignoring whitespace and
   * unicode issues.
   *
   * @param content
   * @param text
   * @param start
   *            Go from here. 0 is the normal value.
   * @return [start, end) position, or null if not found
   */
  public static Pair<Integer> findLenient(String content, String text,
      int start) {
    // Note: these edits do not change the offsets
    content = StrUtils.normalise(content, KErrorPolicy.RETURN_NULL);
    text = StrUtils.normalise(text, KErrorPolicy.RETURN_NULL);
    content = content.toLowerCase();
    text = text.toLowerCase();

    // regex escape (can't use Pattern.quote() 'cos we do want flexible
    // whitespace)
    String regex = content.replace("\\", "\\\\");
    String SPECIAL = "()[]{}$^.*+?";
    for (int i = 0; i < SPECIAL.length(); i++) {
      char c = SPECIAL.charAt(i);
      regex = regex.replace("" + c, "\\" + c);
    }

    // turn whitespace in the quote into a whitespace pattern
    regex = regex.replaceAll("\\s+", "\\\\s+");
    // find it
    Pattern p = Pattern.compile(regex);
    Matcher m = p.matcher(text);
    if (m.find(start))
      return new Pair(m.start(), m.end());
    return null;
  }

  public static String getFirstName(String name) {
    name = name.trim();
    assert !name.contains("\n") : name;
    String[] nameBits = name.split("[ \t\\.,]+");
    String firstName = nameBits[0];
    // don't send a card "Dear Dr"
    firstName = StrUtils.toTitleCase(firstName);
    List<String> titles = Arrays.asList("Mr", "Mrs", "Ms", "Dr", "Doctor",
        "Prof", "Professor", "Sir", "Director");
    if (titles.contains(firstName)) {
      firstName = nameBits[1];
      firstName = StrUtils.toTitleCase(firstName);
    }
    return firstName;
  }

  /**
   * Inverse of {@link #extractHeader(StringBuilder)}. Not very robust
   *
   * @param header
   * @return
   */
  public static String getHeaderString(Map header) {
    StringBuilder sb = new StringBuilder();
    for (Object k : header.keySet()) {
      String ks = k.toString().trim().toLowerCase();
      String vs = header.get(k).toString();
      sb.append(ks + ": " + vs + StrUtils.LINEEND);
    }
    return sb.toString();
  }

  /**
   * @param text
   * @return indexes for the first character of each line
   */
  public static int[] getLineStarts(String text) {
    List<Integer> starts = new ArrayList<Integer>();
    for (int i = 0; i < text.length(); i++) {
      char c = text.charAt(i);
      // windows or linux style linebreak
      if (c == '\n') {
        starts.add(i);
      }
      if (c == '\r') {
        int ni = i + 1;
        if (ni == text.length() || text.charAt(ni) != '\n') {
          // Mac style linebreak
          starts.add(i);
        }
      }
    }
    return MathUtils.toIntArray(starts);
  }

  private static String hash(String hashAlgorithm, String txt) {
    try {
      java.security.MessageDigest md = java.security.MessageDigest
          .getInstance(hashAlgorithm);
      StringBuffer result = new StringBuffer();
      try {
        for (byte b : md.digest(txt.getBytes(ENCODING_UTF8))) {
          result.append(Integer.toHexString((b & 0xf0) >>> 4));
          result.append(Integer.toHexString(b & 0x0f));
        }
      } catch (UnsupportedEncodingException e) {
        for (byte b : md.digest(txt.getBytes())) {
          result.append(Integer.toHexString((b & 0xf0) >>> 4));
          result.append(Integer.toHexString(b & 0x0f));
        }
      }
      return result.toString();
    } catch (java.security.NoSuchAlgorithmException ex) {
      throw new RuntimeException(ex);
    }
  }

  /**
   * Convenience for a surprisingly common case. Use this to guard statements
   * like <code>Long.valueOf(String)</code>.
   *
   * @param possNumber
   * @return true for e.g. "123"
   */
  public static boolean isJustDigits(String possNumber) {
    for (int i = 0; i < possNumber.length(); i++) {
      if (!Character.isDigit(possNumber.charAt(i)))
        return false;
    }
    return true;
  }

  /**
   * @param x
   *            Can be null (returns false)
   * @return true if x is in fact a number
   *
   *         ??Should we support non-standard formats such as "1,000", "10k"?
   *         Not here!
   */
  public static boolean isNumber(String x) {
    if (x == null)
      return false;
    try {
      // should we use a regex instead? \\d+(\\.\\d+)?
      Double.valueOf(x);
      return true;
    } catch (Exception e) {
      return false;
    }
  }

  /**
   *
   * @param txt
   * @param c
   * @return true if txt only consists of the given character, e.g.
   *         "--------". false for null and ""
   */
  public static boolean isOnly(String txt, char c) {
    if (txt == null || txt.length() == 0)
      return false;
    for (int i = 0; i < txt.length(); i++) {
      if (txt.charAt(i) != c)
        return false;
    }
    return true;
  }

  /**
   * @param txt
   * @return true if txt represents a single word (with no whitespace). Just a
   *         convenience for using the \w regex.
   */
  public static boolean isWord(String txt) {
    return txt.matches("\\w+");
  }

  /**
   * Return a string that is the string representation of the elements of list
   * separated by separator
   * <p>
   * Identical to {@link Printer#toString(Collection, String)}, to which this
   * simply delegates.
   *
   * @param list
   *            The elements to join.
   * @param separator
   *            Placed between elements.
   * @return the joined string
   * @see #join(StringBuilder, Collection, String)
   */
  public static <T> String join(Collection<T> list, String separator) {
    return Printer.toString(list, separator);
  }

  /**
   * Convenience for the case where the list has start & end markers. Return a
   * string that is start + the string representation of the elements of list
   * separated by separator + end. E.g. "(", [1,2,], " or ", ")" => "(1 or 2)"
   */
  public static <T> StringBuilder join(String start, Collection<T> list,
      String separator, String end) {
    StringBuilder sb = new StringBuilder(start);
    if (!list.isEmpty()) {
      for (T t : list) {
        if (t == null) {
          continue;
        }
        sb.append(Printer.toString(t));
        sb.append(separator);
      }
      if (sb.length() != 0) {
        pop(sb, separator.length());
      }
    }
    sb.append(end);
    return sb;
  }

  public static String join(String[] array, String separator) {
    if (array.length == 0)
      return "";
    StringBuilder sb = new StringBuilder();
    for (String string : array) {
      if (string == null) {
        continue;
      }
      sb.append(string);
      sb.append(separator);
    }
    if (sb.length() != 0) {
      pop(sb, separator.length());
    }
    return sb.toString();
  }

  /**
   * Append the string representation of the elements of list separated by
   * separator.
   * <p>
   * Identical to {@link Printer#append(StringBuilder, Collection, String)},
   * to which this simply delegates.
   *
   * @param sb
   *            The builder to append to.
   * @param list
   *            The elements to join.
   * @param separator
   *            Placed between elements.
   * @see #join(Collection, String)
   */
  public static <T> void join(StringBuilder sb, Collection<T> list,
      String separator) {
    Printer.append(sb, list, separator);
  }

  /**
   * Convert a block of text (read from the console) into a Java String
   * constant
   *
   * @param args
   * @throws IOException
   */
  public static void main(String[] args) throws IOException {
    String txt = "";
    BufferedReader in = FileUtils.getReader(System.in);
    while (true) {
      String line = in.readLine();
      if (line.equals("EXIT") || line.equals("QUIT")) {
        break;
      }
      txt += line + "\n";
    }
    String jtxt = convertToJavaString(txt);
    // WebUtils.stripTags(txt);
    System.out.println(jtxt);
  }

  /**
   * Compute the MD5 hash of some text, via the private hash helper.
   * <p>
   * TODO a sha1 alternative
   *
   * @param txt
   *            The text to hash. Must not be null.
   * @return MD5 hash of txt, as a hex string
   */
  public static String md5(String txt) {
    return hash("MD5", txt);
  }

  /**
   * Ensure that the string builder is pointing at a new line. I.e. if the
   * last character is a line-end, this does nothing. if the last character is
   * not a line-end, this adds a line end.
   *
   * @param text
   */
  public static void newLine(StringBuilder text) {
    if (text.length() == 0)
      return;
    char last = text.charAt(text.length() - 1);
    if (last == '\r' || last == '\n')
      return;
    text.append(LINEEND);
  }

  /**
   * Convert unicode text into a normalised Ascii form. Convenience for
   * {@link #normalise(String, Log.KErrorPolicy)} using
   * KErrorPolicy.RETURN_NULL -- which, in that method, substitutes ? for
   * letters that cannot be converted.
   *
   * @param unicode
   *            The text to normalise. Must not be null.
   * @return ascii text
   */
  public static String normalise(String unicode) {
    return normalise(unicode, KErrorPolicy.RETURN_NULL);
  }

  /**
   * Convert unicode text into a normalised Ascii form -- if we can. E.g.
   * strip out umlauts.
   * <p>
   * Text which is already pure ascii (and has no 0 chars) is returned as-is.
   * Otherwise the text is decomposed (NFD), the various apostrophe, quote
   * and dash characters are replaced by ' " and -, ascii characters are
   * kept, and other non-letters (which includes the accents split off by the
   * decomposition, but also non-ascii punctuation) are dropped. What happens
   * to the remaining non-ascii letters depends on the policy.
   *
   * @param unicode
   *            The text to normalise. Must not be null.
   * @param onUnrecognisableChar
   *            If we cannot recognise a character, what to do?
   *            KErrorPolicy.IGNORE will skip over un-normalised chars.
   *            KErrorPolicy.ACCEPT will keep the un-normalised char.
   *            KErrorPolicy.RETURN_NULL will substitute ? for unrecognised
   *            chars.
   *            KErrorPolicy.REPORT will log a report and skip the char.
   *            KErrorPolicy.DIE and KErrorPolicy.THROW_EXCEPTION will throw
   *            a FailureException.
   * @return ascii text (unless the policy is ACCEPT, in which case
   *         unrecognised letters are passed through)
   * @testedby {@link StrUtilsTest#testNormalise()}
   */
  public static String normalise(String unicode,
      Log.KErrorPolicy onUnrecognisableChar)
      throws IllegalArgumentException {
    // all ascii anyway?
    boolean ascii = true;
    for (int i = 0, n = unicode.length(); i < n; i++) {
      char c = unicode.charAt(i);
      if (c > 127 || c == 0) {
        ascii = false;
        break;
      }
    }
    if (ascii)
      return unicode;
    // alternatively, we could use a lookup table
    // c.f. http://www.rgagnon.com/javadetails/java-0456.html which uses 2
    // aligned strings for lookup
    // NFD decomposition separates base letters from their accents
    String normed = Normalizer.normalize(unicode, Normalizer.Form.NFD);
    StringBuilder clean = new StringBuilder(normed.length());
    for (int i = 0, n = normed.length(); i < n; i++) {
      char c = normed.charAt(i);
      // Fancy punctuation => the plain ascii equivalent
      if (APOSTROPHES.indexOf(c) != -1) {
        clean.append('\'');
        continue;
      }
      if (QUOTES.indexOf(c) != -1) {
        clean.append('"');
        continue;
      }
      if (DASHES.indexOf(c) != -1) {
        clean.append('-');
        continue;
      }
      // ascii?
      // NB: filter out any bogus 0 chars (rarely needed, but a useful
      // safety measure)
      if (c < 128 && c != 0) {
        clean.append(c);
        continue;
      }
      // boolean hs = Character.isHighSurrogate(c);
      // boolean ls = Character.isLowSurrogate(c);
      // boolean vcp = Character.isValidCodePoint(c);
      // boolean scp = Character.isSupplementaryCodePoint(c);
      if (!Character.isLetter(c)) {
        // // ignore non-letters, e.g. umlauts
        // Unfortunately this also swallows non-standard punctuation
        continue;
      }
      // A non-ascii letter: apply the caller's policy
      switch (onUnrecognisableChar) {
      case DIE:
      case THROW_EXCEPTION:
        throw new FailureException(unicode);
      case IGNORE:
        continue;
      case ACCEPT:
        // filter out any bogus 0 chars (rarely needed, but a useful
        // safety measure)
        clean.append(c == 0 ? ' ' : c);
        break;
      case RETURN_NULL:
        clean.append('?');
        break;
      case REPORT:
        // NB: this reports once per unrecognised char
        Log.report("Could not normalise to ascii: " + unicode);
        // ignore
      }
    }
    // if ((onUnrecognisableChar == KErrorPolicy.ACCEPT ||
    // onUnrecognisableChar == KErrorPolicy.RETURN_NULL)
    // && clean.length() < unicode.length() / 2) {
    // throw new IllegalArgumentException(unicode +" to "+clean);
    // }
    return clean.toString();
  }

  /**
   * Delete a number of chars from the ends of a {@link StringBuilder}.
   *
   * @param sb
   * @param chars
   */
  public static void pop(StringBuilder sb, int chars) {
    sb.delete(sb.length() - chars, sb.length());
  }

  /**
   * Replace regex with null, collecting the cuttings.
   *
   * @param string
   * @param regex
   * @param removed
   *            The cut bits of text will be added to this
   * @return string after a s/regex/""/ op
   */
  public static String remove(String string, String regex,
      final Collection removed) {
    String s2 = replace(string, Pattern.compile(regex), new IReplace() {
      @Override
      public void appendReplacementTo(StringBuilder sb, Matcher match) {
        removed.add(match.group());
        return;
      }
    });
    return s2;
  }

  /**
   * Repeat a character
   *
   * @param c
   * @param n
   * @return e.g. '-',5 creates "-----"
   */
  public static String repeat(char c, int n) {
    char[] chars = new char[n];
    Arrays.fill(chars, c);
    return new String(chars);
  }

  /**
   * Like a monkey with a miniature symbol. The joy of repetition really is in
   * you. ?? Does this deserve to be a utility method?
   *
   * @param string
   * @param n
   * @return stringstringstring...
   */
  public static String repeat(String string, int n) {
    StringBuilder sb = new StringBuilder(string.length() * n);
    for (int i = 0; i < n; i++) {
      sb.append(string);
    }
    return sb.toString();
  }

  /**
   * Use a regex, calling out to a function to compute the replacements.
   *
   * @param string
   * @param regex
   * @param replace
   *            Determines what replacements to make
   * @return string after all matches of regex have been replaced.
   */
  public static String replace(String string, Pattern regex, IReplace replace) {
    Matcher m = regex.matcher(string);
    StringBuilder sb = new StringBuilder(string.length() + 16);
    int pos = 0;
    while (m.find()) {
      sb.append(string.substring(pos, m.start()));
      replace.appendReplacementTo(sb, m);
      pos = m.end();
    }
    sb.append(string.substring(pos, string.length()));
    return sb.toString();
  }

  /**
   * Get a StringBuilder from what might be a String (avoid a copy if we can)
   *
   * @param charSeq
   * @return a string builder, which will be the input if it already is one.
   */
  public static StringBuilder sb(CharSequence charSeq) {
    return charSeq instanceof StringBuilder ? (StringBuilder) charSeq
        : new StringBuilder(charSeq);
  }

  /**
   * A slightly smarter version of split on whitespace, or at least,
   * different. Splits on whitespace or commas, and supports quoting (but you
   * can't actually use quotes at all within a token). E.g. tag1, "tag 2"
   * tag-3
   *
   * @param line
   *            Can be null
   * @return May be empty if the input is blank
   *
   * @see
   * @testedby {@link StrUtilsTest#split()}
   */
  public static List<String> split(String line) {
    if (line == null || line.length() == 0)
      return Collections.emptyList();
    ArrayList<String> row = new ArrayList<String>();
    StringBuilder field = new StringBuilder();
    char quote = '"';
    boolean inQuotes = false;
    for (int i = 0, n = line.length(); i < n; i++) {
      char c = line.charAt(i);
      if (c == quote) {
        inQuotes = !inQuotes;
        continue;
      }
      if (inQuotes) {
        // just add it
        field.append(c);
        continue;
      }
      if (Character.isWhitespace(c) || c == ',') {
        if (field.length() == 0) {
          continue;
        }
        // Finished a tag
        row.add(field.toString());
        field = new StringBuilder();
        continue;
      }
      // just add it
      field.append(c);
    }
    // Add last field
    if (field.length() == 0)
      return row;
    String f = field.toString();
    row.add(f);
    return row;
  }

  /**
   * Split into paragraph blocks by looking for an empty line.
   *
   * @param message
   * @return
   * @testedby {@link StrUtilsTest#testSplitBlocks()}
   */
  public static String[] splitBlocks(String message) {
    return message.split("\\s*\r?\n\\s*\r?\n"); // TODO a better regex
  }

  /**
   * Split a string in half at the first instance of c. E.g.
   * splitFirst("A:B",':') == ("A","B")
   *
   * @param line
   * @param c
   * @return the line upto c, and after it. Neither part includes c. Or null
   *         if c could not be found.
   */
  public static Pair<String> splitFirst(String line, char c) {
    int i = line.indexOf(c);
    if (i == -1)
      return null;
    String end = i == line.length() ? "" : line.substring(i + 1);
    return new Pair<String>(line.substring(0, i), end);
  }

  /**
   * Split text into lines, using {@link #LINEENDINGS} (so \r\n, \r and \n
   * all count as a line ending).
   *
   * @param txt
   *            The text to split. Must not be null.
   * @return txt split into lines. The String values do not include line
   *         endings. This is just a convenience for a regex with
   *         cross-platform line-endings. Note that trailing empty lines will
   *         be discarded.
   */
  public static String[] splitLines(String txt) {
    return LINEENDINGS.split(txt);
  }

  /**
   * A more flexible (and dangerous) version of substring.
   *
   * @param string
   *            Can be null, in which case null will be returned
   * @param start
   *            Inclusive. Can be negative for distance from the end.
   * @param end
   *            Exclusive. Can be negative for distance from the end. E.g. -1
   *            indicates "all but the last character" (zero indicates
   *            "up to the end"). Can be longer than the actual string, in
   *            which case it is reduced. If end is negative and too large, an
   *            empty string will be returned.
   * @return The chopped string. null if the input was null. The empty string
   *         if the range was invalid.
   */
  public static String substring(String string, int start, int end) {
    if (string == null)
      return null;
    int len = string.length();
    // start from end?
    if (start < 0) {
      start = len + start;
      if (start < 0) {
        start = 0;
      }
    }
    // from end?
    if (end <= 0) {
      end = len + end;
      if (end < start)
        return "";
    }
    // too long?
    if (end > len) {
      end = len;
    }
    // OK
    if (start == 0 && end == len)
      return string;
    return string.substring(start, end);
  }

  /**
   * Perform common "canonicalisation" operations. Often strings are
   * equivalent if they only differ in case, punctuation, or spacing.
   *
   * @param string
   *            Can be null (returns "")
   * @return compact "canonical" version of string: lowercased, compact
   *         whitespace & trim, normalised (no accents), and all punctuation
   *         is converted into spaces.
   *
   *         TODO should this go further & strip all " ", "-" and "_" chars??
   * @testedby {@link StrUtilsTest#testToCanonical()}
   */
  public static String toCanonical(String string) {
    if (string == null)
      return "";
    StringBuilder sb = new StringBuilder();
    boolean spaced = false;
    for (int i = 0, n = string.length(); i < n; i++) {
      char c = string.charAt(i);
      // lowercase letters
      if (Character.isLetterOrDigit(c)) {
        spaced = false;
        // Note: javadoc recommends String.toLowerCase() as being better
        // -- I wonder if it actually is, or if this is aspirational
        // internationalisation?
        c = Character.toLowerCase(c);
        sb.append(c);
        continue;
      }
      // all else as spaces
      // compact whitespace
      // if (Character.isWhitespace(c)) {
      if (spaced || sb.length() == 0) {
        continue;
      }
      sb.append(' ');
      spaced = true;
      // }
      // ignore punctuation!
    }
    if (spaced) {
      pop(sb, 1);
    }
    string = sb.toString();
    // ditch the accents, is we can
    return normalise(string, KErrorPolicy.ACCEPT);
  }

  /**
   * Convert all line-endings to \n. Convert all blank lines to being empty
   * lines.
   *
   * @param text
   * @return
   */
  public static String toCleanLinux(String text) {
    text = text.replace("\r\n", "\n");
    text = text.replace('\r', '\n');
    text = BLANK_LINE.matcher(text).replaceAll("");
    return text;
  }

  /**
   *
   * @param "Hello world"
   * @return "HW"
   */
  public static String toInitials(String name) {
    StringBuilder sb = new StringBuilder();
    boolean yes = true;
    for (int i = 0; i < name.length(); i++) {
      char c = name.charAt(i);
      if (Character.isWhitespace(c)) {
        yes = true;
        continue;
      }
      if (yes) {
        c = Character.toUpperCase(c);
        sb.append(c);
      }
      yes = false;
    }
    return sb.toString();
  }

  /**
   * Format a number to n significant figures.
   *
   * @param x
   *            The number to format.
   * @param n
   *            How many significant figures to keep. Must be positive.
   * @return x rounded to n significant figures. Numbers below 1 are written
   *         out as plain decimals where toNSigFigs2_small can do so;
   *         everything else is formatted by Printer.toStringNumber.
   * @testedby {@link StrUtilsTest#testToNSigFigs()}
   */
  public static String toNSigFigs(double x, int n) {
    assert n > 0;
    String sign = x < 0 ? "-" : "";
    double v = Math.abs(x);
    // the decimal exponent of v, i.e. 10^lv <= v < 10^(lv+1)
    // NOTE(review): for x == 0 this is -Infinity -- confirm that case is
    // handled as wanted.
    double lv = Math.floor(Math.log10(v));
    double keeper = Math.pow(10, n - 1);
    double tens = Math.pow(10, lv);
    // the n digits to keep, as an integer (scaled so the first significant
    // digit is the leading one)
    int keepMe = (int) Math.round(v * keeper / tens);
    // avoid scientific notation for fairly small decimals
    if (lv < 0) {
      String s = toNSigFigs2_small(n, sign, lv, keepMe);
      if (s != null)
        return s;
    }
    // scale the kept digits back to the original magnitude
    double vt = keepMe * tens / keeper;
    String num = Printer.toStringNumber(vt);
    return sign + num;
  }

  /**
   * Helper for {@link #toNSigFigs(double, int)}: write a number below 1 as a
   * plain decimal, "0." + leading zeros + the significant digits.
   *
   * @param n
   *            How many significant figures are wanted.
   * @param sign
   *            "-" or "".
   * @param lv
   *            The decimal exponent of the number (negative here).
   * @param keepMe
   *            The significant digits, as an integer.
   * @return the formatted number, or null if the caller should fall back to
   *         its general formatting (very small numbers, or numbers which
   *         rounded up to 1).
   */
  private static String toNSigFigs2_small(int n, String sign, double lv,
      int keepMe) {
    // use scientific notation for very small
    if (lv < -8)
      return null;
    StringBuilder sb = new StringBuilder(sign);
    // position of the first significant digit after the decimal point
    int zs = (int) -lv;
    String sKeepMe = Integer.toString(keepMe);
    if (sKeepMe.length() > n) {
      assert sKeepMe.charAt(sKeepMe.length() - 1) == '0';
      // we've rounded up from 9 to 10, so lose a decimal place
      zs--;
      sKeepMe = sKeepMe.substring(0, sKeepMe.length() - 1);
      // rounded up to 1 or more, so no longer a "0.xxx" number
      if (zs == 0)
        return null;
    }
    sb.append("0.");
    // zs - 1 zeros between the decimal point and the first digit
    for (int i = 1; i < zs; i++) {
      sb.append('0');
    }
    sb.append(sKeepMe);
    return sb.toString();
  }

  /**
   * Converts words and multiple words to lowercase and Uppercase on first
   * letter only E.g. daniel to Daniel, the monkeys to The Monkeys, BOB to
   * Bob. Allows apostrophes to be in words. Handles multiple words.
   *
   * @testedby {@link StrUtilsTest#testToTitleCase1()}
   */
  public static String toTitleCase(String title) {
    if (title.length() < 2)
      return title.toUpperCase();
    StringBuilder sb = new StringBuilder(title.length());
    boolean goUp = true;
    for (int i = 0, n = title.length(); i < n; i++) {
      char c = title.charAt(i);
      if (Character.isLetterOrDigit(c) || c == '\'') {
        if (goUp) {
          sb.append(Character.toUpperCase(c));
          goUp = false;
        } else {
          sb.append(Character.toLowerCase(c));
        }
      } else {
        sb.append(c);
        goUp = true;
      }
    }
    return sb.toString();
  }

  /**
   * Handles multiple words, camel-case (*if* the first letter is lower-case),
   * and _. White space is not preserved.
   *
   * @param wouldBeTitle
   * @return E.g. "Spoon Mc Guffin" from "spoonMcGuffin", or "Spoon Mcguffin"
   *         from "spoon mcguffin" Hm... could move to NLP where it could also
   *         know about stop words
   *
   * @testedby {@link StrUtilsTest#testToTitleCasePlus}
   */
  public static String toTitleCasePlus(String wouldBeTitle) {
    String[] words = wouldBeTitle.split("(_|\\s+)");
    StringBuilder sb = new StringBuilder();
    for (String word : words) {
      if (word.length() == 0) {
        continue;
      }
      // camelCase - is only if starts lower case to avoid mangling
      // McSwinney type names.
      if (Character.isUpperCase(word.charAt(0))) {
        sb.append(word);
        sb.append(' ');
        continue;
      }
      word = replace(word, Pattern.compile("[A-Z]?[^A-Z]+"),
          new IReplace() {
            @Override
            public void appendReplacementTo(StringBuilder sb2,
                Matcher match) {
              String w = match.group();
              w = toTitleCase(w);
              sb2.append(w);
              sb2.append(' ');
            }
          });
      sb.append(word);
    }
    if (sb.length() != 0) {
      pop(sb, 1);
    }
    return sb.toString();
  }

  /**
   * NOT IMPLEMENTED YET: currently this returns the input unchanged.
   * <p>
   * TODO trim out whitespace and punctuation from the beginning and end of
   * the string.
   *
   * @param string
   *            The text to trim.
   * @return string, exactly as given
   */
  public static String trimPunctuation(String string) {
    // TODO Auto-generated method stub
    return string;
  }

  /**
   * Trims a wrapping pair of ''s or ""s. ??does nothing with whistepsace or
   * if the string has just a leading/trailing '/"
   *
   * @param string
   * @return string, or string less the wrapping quotes
   */
  public static String trimQuotes(String string) {
    if (string.charAt(0) != '\'' && string.charAt(0) != '\"')
      return string;
    char c = string.charAt(string.length() - 1);
    if (c != '\'' && c != '\"')
      return string;
    return string.substring(1, string.length() - 1);
  }

  /**
   * Simple whitespace based word counter.
   *
   * @param text
   * @return
   */
  public static int wordCount(String text) {
    return text.split("\\s+").length;
  }

  /**
   * This is a static class (all the methods are static, and there is no
   * state), but for those users who need an object... this public
   * constructor is kept deliberately.
   */
  public StrUtils() {
  }

}
// TOP
//
// Related Classes of winterwell.utils.StrUtils
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.