// Package org.cmc.music.clean
//
// Source Code of org.cmc.music.clean.MetadataCleanup$RegExCache

/*
* Written By Charles M. Chen
*
* Created on Jan 1, 2006
*
*/

package org.cmc.music.clean;

import java.util.Arrays;
import java.util.Hashtable;
import java.util.Map;
import java.util.Vector;

import org.apache.regexp.RE;
import org.cmc.music.metadata.IMusicMetadata;
import org.cmc.music.metadata.MusicMetadataConstants;
import org.cmc.music.util.Debug;
import org.cmc.music.util.MyComparator;
import org.cmc.music.util.MyMap;
import org.cmc.music.util.TextUtils;

public class MetadataCleanup implements MusicMetadataConstants
{

  /**
   * Placeholder words (field names, "unknown", "n/a", ...) that
   * {@link #clean(String, IMusicMetadata)} hands to removeSafePrefixSuffix as
   * the first clean-up step.
   */
  private static final String DEFAULTS[] = { "album", //
      "artist", //
      "title", //
      "no title", //
      "no artist", //
      "undefined", //
      "va", //
      "mp3", //
      "cd", //
      "genre", //
      "unknown", //
      "name", //
      "n/a", //
      "Untitled", //
  };
  static
  {
    // Re-order the table in place once, at class-load time.
    // NOTE(review): going by the comparator's name this presumably puts the
    // longest entries first - confirm against MyComparator.
    Arrays.sort(DEFAULTS, MyComparator.kToStringLengthReverse);
  }
  /**
   * Spellings (including common misspellings) of "various artists" /
   * "compilation". {@link #clean(String, IMusicMetadata)} passes this table to
   * removeSafePrefixSuffix and sets the compilation flag when the value
   * changes or becomes null.
   */
  private static final String DEFAULTS_VA[] = { "Compilation", //
      "V.A", //
      "V.A.", //
      "V. A.", //
      "V. A", //
      "V/A", //
      "Va", //
      "V A", //
      "Various Artists", //
      "Various", //
      "Varioius", //
      "Varied Artists", //
      "Varias", //
      "Varios Interpretes", //
      "Varios", //
      "Various Artist", //
      "Various Artistses", //
      "Various Artits", //
      "Various Artisis", //
      "Various Aritsts", //
      "Varius Artists", //
      "Various Composers", //
      "Various djs", //
  };
  static
  {
    // Re-order the table in place once, at class-load time.
    // NOTE(review): presumably longest entries first, going by the
    // comparator's name - confirm against MyComparator.
    Arrays.sort(DEFAULTS_VA, MyComparator.kToStringLengthReverse);
  }
  /**
   * Phrases that mark a value as belonging to a soundtrack.
   * {@link #clean(String, IMusicMetadata)} passes this table to
   * removeSafePrefixSuffix and sets the soundtrack flag when the value changes
   * or becomes null.
   */
  private static final String DEFAULTS_SOUNDTRACK[] = {
      "The Motion Picture".toLowerCase(), //
      "Motion Picture".toLowerCase(), //
      "Original Motion Picture".toLowerCase(), //
      "Original Motion Picture Soundtrack".toLowerCase(), //
      "The Soundtrack".toLowerCase(), //
      "Music From The Motion Picture".toLowerCase(), //
      "Original Soundtrack Recording".toLowerCase(), //
      "Trilha Sonora Original".toLowerCase(), //
      "ost", //
      "original soundtrack", //
      "soundtrack", //
      // NOTE(review): the only entry that is not lower-cased; harmless if the
      // lookup goes through RegExCache (which lower-cases patterns) - confirm.
      "Music From The Motion Picture Soundtrack", //
  };
  static
  {
    // Re-order the table in place once, at class-load time.
    // NOTE(review): presumably longest entries first, going by the
    // comparator's name - confirm against MyComparator.
    Arrays.sort(DEFAULTS_SOUNDTRACK, MyComparator.kToStringLengthReverse);
  }
  /**
   * Regular-expression fragment (lower-cased to "ac+ap+el+as?") used to spot
   * the various spellings of "a cappella" written as one word.
   */
  private static final String DEFAULT_ACAPELLA = "Ac+ap+el+as?".toLowerCase();

  public String rectifyGeneric(String s)
  {
    return rectifyGeneric(s, null);
  }

  public String rectifyGeneric(String s, IMusicMetadata flags)
  {
    String old = s;
    while (true)
    {
      s = rectifyGeneric_1(s, flags);
      s = removeQuotes(s);
      if (s == null)
        return null;
      if (s.equals(old))
        return s;
      old = s;
    }
  }

  private String removeQuotes(String s)
  {
    if (s == null)
      return null;

    if (new RE("^\".+\"$").match(s) || new RE("^'.+'$").match(s)
        || new RE("^\\{.+\\}$").match(s)
        || new RE("^\\(.+\\)$").match(s) || new RE("^<.+>$").match(s)
        || new RE("^\\[.+\\]$").match(s))
    {
      s = s.substring(1, s.length() - 1);
    }

    return s;
  }

  private static final String ROMAN_NUMERALS = "ivx";

  private boolean isRomanNumeral(String s)
  {
    char chars[] = s.toCharArray();
    for (int i = 0; i < chars.length; i++)
    {
      char c = chars[i];
      if (ROMAN_NUMERALS.indexOf(c) < 0
          && ROMAN_NUMERALS.toUpperCase().indexOf(c) < 0)
        return false;
    }
    return true;
  }

  private static final Map NATURAL_NUMBERS = new MyMap();
  static
  {
    NATURAL_NUMBERS.put("zero", new Integer(0));
    NATURAL_NUMBERS.put("one", new Integer(1));
    NATURAL_NUMBERS.put("two", new Integer(2));
    NATURAL_NUMBERS.put("three", new Integer(3));
    NATURAL_NUMBERS.put("four", new Integer(4));
    NATURAL_NUMBERS.put("five", new Integer(5));
    NATURAL_NUMBERS.put("six", new Integer(6));
    NATURAL_NUMBERS.put("seven", new Integer(7));
    NATURAL_NUMBERS.put("eight", new Integer(8));
    NATURAL_NUMBERS.put("nine", new Integer(9));
    NATURAL_NUMBERS.put("ten", new Integer(10));
    NATURAL_NUMBERS.put("eleven", new Integer(11));
    NATURAL_NUMBERS.put("twelve", new Integer(12));
    NATURAL_NUMBERS.put("thirteen", new Integer(13));
    NATURAL_NUMBERS.put("fourteen", new Integer(14));
    NATURAL_NUMBERS.put("fifteen", new Integer(15));
    NATURAL_NUMBERS.put("sixteen", new Integer(16));
    NATURAL_NUMBERS.put("seventeen", new Integer(17));
    NATURAL_NUMBERS.put("eighteen", new Integer(18));
    NATURAL_NUMBERS.put("nineteen", new Integer(19));
    NATURAL_NUMBERS.put("twenty", new Integer(20));
  }

  private Number parseNumber(String s)
  {
    if (s == null)
      return null;
    s = s.trim();
    if (s.length() < 1)
      return null;

    try
    {
      return Integer.valueOf(s.trim());
    } catch (Throwable e)
    {
      // Debug.debug(s, e.getMessage());
    }
    Number value = (Number) NATURAL_NUMBERS.get(s.toLowerCase());
    return value;
  }

  private String clean(String s, IMusicMetadata flags)
  {
    // String old_s = s;
    s = s.trim();

    s = Diacriticals.convertDiacriticals(s);

    while (s.startsWith("-"))
      s = s.substring(1);

    s = removeSafePrefixSuffix(s, DEFAULTS);
    if (s == null)
      return null;

    {
      // Debug.debug("!x! considering disc: '" + s + "'");

      final String DISC_REGEXS[] = {
          "[-\\(\\[] ?dis[ck] ?([a-zA-Z\\d]+)[\\)\\]]?$", //
          "[-\\(\\[] ?cd ?([a-zA-Z\\d]+)[\\)\\]]?$", //
          "^[\\(\\[]?dis[ck] ?([a-zA-Z\\d]+) ?[-\\)\\]]", //
          "^[\\(\\[]?cd ?([a-zA-Z\\d]+) ?[-\\)\\]]", //
          "^dis[ck] ?([a-zA-Z\\d]+)$", //
          "^cd[\\. \\-]*([a-zA-Z\\d]+)$", //
      };

      for (int i = 0; i < DISC_REGEXS.length; i++)
      {
        String kDISC_REGEX = DISC_REGEXS[i];

        RE re = new RE(kDISC_REGEX);
        if (re.match(s.toLowerCase()))
        {

          if (re.getParenCount() < 2)
          {
            Debug.debug("Disc missing number", s);
            Debug.dumpStack(3);
            continue;
          }
          int start = re.getParenStart(0);
          int end = re.getParenEnd(0);
          String value = re.getParen(1);
          // Debug.debug("value", value);
          Number number = parseNumber(value);
          if (number == null)
          {
            Debug.debug("Disc missing value", value);
            Debug.dumpStack(3);
            continue;
          }
          // Debug.debug("number", number);

          String fixed = s.substring(0, start) + s.substring(end);
          // Debug.debug("fixed", fixed);
          // Debug.debug("start", start);
          // Debug.debug("end", end);
          if (flags != null)
            flags.setDiscNumber(number);
          s = fixed.trim();
          // return null;
        }
      }

    }

    String suffixes[] = { " ", //
        "-", //
        ".Mp3", //
        " Mp3", //
    };
    s = removeSuffixes(s, suffixes);
    String prefixes[] = { " ", //
        "-", //
    };
    s = removePrefixes(s, prefixes);

    // {
    // final String suffixes[] = {
    // " ", ".", "-", "Gabba Cc", "GABBA_CC", "G A B B A C C",
    // "mp3-link", "Mp3 - Link", ".mp3",
    // };
    //
    // s = this.removeSuffixes(s, suffixes);
    // }

    // Debug.debug("clean 2", s);

    s = s.replace('_', ' ');
    s = TextUtils.replace(s, "-", " - ");
    s = TextUtils.replace(s, "`", "'");
    s = TextUtils.replace(s, "�", "'");
    s = TextUtils.replace(s, "�", "'");
    s = TextUtils.replace(s, "[", "(");
    s = TextUtils.replace(s, "]", ")");
    s = TextUtils.replace(s, "(", " (");
    s = TextUtils.replace(s, "~", "-");
    s = TextUtils.replace(s, "  ", " ");
    s = TextUtils.replace(s, "  ", " ");
    s = TextUtils.replace(s, "..", ".");
    s = TextUtils.replace(s, "--", "-");
    s = TextUtils.replace(s, "- -", "-");

    s = TextUtils.replace(s, "#", "No. ");

    {
      String old = s;

      // Debug.debug("s1", s);
      s = removeSafePrefixSuffix(s, DEFAULTS_SOUNDTRACK, true);
      // Debug.debug("s2", s);

      if (s == null || !s.equals(old))
      {
        if (flags != null)
          flags.setIsSoundtrack(Boolean.TRUE);
      }
      if (s == null)
        return null;
    }

    {
      String temp = removeSafePrefixSuffix(s, DEFAULT_ACAPELLA, true);
      // String temp = removeSafePrefixSuffix(s, kDEFAULTS_acapella);
      // Debug.debug("s2", s);

      if (temp == null || !s.equals(temp))
      {
        if (flags != null)
          flags.setIsAcapella(Boolean.TRUE);
      }
    }

    // Debug.debug("clean 3", s);

    {
      String old = s;

      s = removeSafePrefixSuffix(s, DEFAULTS_VA, true);

      if (s == null || !s.equals(old))
      {
        if (flags != null)
          flags.setIsCompilation(Boolean.TRUE);
      }
      if (s == null)
        return null;
    }

    {
      String splits[] = TextUtils.split(s, " ");
      for (int i = 0; i < splits.length; i++)
      {
        if (isRomanNumeral(splits[i]))
          splits[i] = splits[i].toUpperCase();
      }
      s = TextUtils.join(splits, " ");
    }

    s = s.trim();

    // Debug.debug("clean 4", s);

    // if (s == null)
    // return null;

    // s = insertSpacesBeforeCaps(s);
    s = toTitleCase(s);

    return s;
  }

  private String toTitleCase(String s)
  {
    StringBuffer result = new StringBuffer();
    char prev = 0;

    // Debug.debug("toTitleCase before", s);
    char chars[] = s.toCharArray();
    for (int i = 0; i < chars.length; i++)
    {
      char c = chars[i];

      if (Character.isLetter(c))
      {
        if (i == 0)
          result.append(Character.toUpperCase(c));
        else if ((prev == '\''))
          // else if ((prev == '\'') && Character.isLetter(next))
          result.append(Character.toLowerCase(c));
        else if (!Character.isLetter(prev))
          result.append(Character.toUpperCase(c));
        else
          result.append(Character.toLowerCase(c));
      } else
        result.append(c);
      prev = c;
    }

    // Debug.debug("toTitleCase after", s);

    return result.toString();
  }

  private String insertSpacesBeforeCaps(String s)
  {
    StringBuffer result = new StringBuffer();

    char prev = 0;
    for (int i = 0; i < s.length(); i++)
    {
      char c = s.charAt(i);

      if (Character.isLetter(c) && (i > 0) && Character.isLetter(prev)
          && Character.isLowerCase(prev) && Character.isUpperCase(c))
        result.append(' ');

      result.append(c);

      prev = c;
    }

    return result.toString();
  }

  private String rectifyGeneric_1(String s, IMusicMetadata flags)
  {
    // Debug.debug("rectifyGeneric_1 a", s);
    // Debug.debug("rectifyGeneric_1 1", s);

    if (s == null)
      return null;

    s = s.trim();
    if (s.length() < 1)

      return null;
    // s = new MusicOrganizerFilter().getNewName2(s);
    s = clean(s, flags);

    // Debug.debug("rectifyGeneric_1 2", s);

    if (s == null)
      return null;

    s = TextUtils.replace(s, ".", ". ");
    s = TextUtils.replace(s, " .", " ");
    s = TextUtils.replace(s, "  ", " ");

    // Debug.debug("rectifyGeneric_1 b", s);

    if (new RE("^\\?+$").match(s))
    {
      // Debug.debug("discarding question...", s);
      return null;
    }

    // Debug.debug("rectifyGeneric_1 c", s);

    // Debug.debug("rectifyGeneric_1 6", s);

    while (s.startsWith("."))
      s = s.substring(1);

    s = TextUtils.replace(s, "Live @ ", "Live At ");
    s = TextUtils.replace(s, "Live@", "Live At ");

    // s = s.re

    if (s == null)
      return null;

    if (s.endsWith(", The"))
      s = "The " + s.substring(0, s.length() - 5);

    return s;
  }

  public String rectifySongTitle(String s)
  {
    return rectifySongTitle(
    // null, null,
        s, null);
  }

  // public String rectifySongTitle(Album album, String s)
  // {
  // return rectifySongTitle(null, album, s);
  // }
  //
  // public String rectifySongTitle(Artist artist, String s)
  // {
  // return rectifySongTitle(artist, null, s);
  // }

  public String rectifySongTitle(
  // Artist artist, Album album,
      String s, IMusicMetadata flags)
  {
    String old = s;
    // while (true)
    for (int i = 0; true; i++)
    {
      // Debug.debug("s(" + i + ")", s);

      s = rectifySongTitle_1(s, flags);
      if (s == null)
        return null;

      // if (artist != null)
      // {
      // if (s.startsWith(artist.name + " - "))
      // s = s.substring(artist.name.length() + 3);
      // if (s.endsWith(" - " + artist.name))
      // s = s.substring(0, s.length() - (artist.name.length() + 3));
      // }
      // if (album != null)
      // {
      // if (s.startsWith(album.name + " - "))
      // s = s.substring(album.name.length() + 3);
      // if (s.endsWith(" - " + album.name))
      // s = s.substring(0, s.length() - (album.name.length() + 3));
      // }

      if (s.equals(old))
        return s;
      old = s;
    }
  }

  private void parseTrackNumber(String s, IMusicMetadata flags)
  {
    if (flags == null)
      return;

    try
    {
      s = s.trim();
      Number number = new Integer(s);
      if (flags != null)
        flags.setTrackNumberNumeric(number);
      // Debug.debug(KEY_TRACK_NUMBER, number);
    } catch (NumberFormatException e)
    {
      Debug.debug("bad track number", s);
    } catch (Throwable e)
    {
      Debug.debug("s", s);
      Debug.debug(e);
    }
  }

  private String removeTrackNumbers(String s, IMusicMetadata flags)
  {
    if (s == null)
      return null;

    if (new RE("^(audio)? ?track ?[- ]?[0-9][0-9]?$")
        .match(s.toLowerCase()))
    {
      if (s.toLowerCase().startsWith("audio"))
        s = s.substring(5).trim();
      parseTrackNumber(s.substring(5), flags);
      // Debug.debug("discarding track...", s);
      return null;
    }
    if (new RE("^piste ?[- ]?[0-9][0-9]?$").match(s.toLowerCase()))
    {
      parseTrackNumber(s.substring(5), flags);
      // Debug.debug("discarding track...", s);
      return null;
    }

    // Debug.debug("removeTrackNumbers 1", s);

    if (new RE("^[0-9][0-9] - ").match(s)
        || new RE("^[0-9][0-9][0-9] - ").match(s)
        || new RE("^[aAbBcCdD][0-9] - ").match(s))
    {
      // Debug.debug("attempting to strip track number...", s);
      int index = s.indexOf('-');
      if (index >= 0)
      {
        String after = s.substring(index + 1).trim();
        // Debug.debug("\t" + "after", after);
        if (after.indexOf('-') < 0) // if mutiple -'s then ignore...
        {
          parseTrackNumber(s.substring(0, index), flags);
          s = after;
        }
      }
    }

    if (new RE("^\\([0-9][0-9]\\) ").match(s)
        || new RE("^\\([abcdABCD][0-9]\\) ").match(s))
    {
      // Debug.debug("attempting to strip track number...", s);
      int index = s.indexOf(')');
      if (index >= 0)
      {
        parseTrackNumber(s.substring(1, index), flags);
        s = s.substring(index + 1).trim();
      }
    }
    // Debug.debug("removeTrackNumbers 2", s);

    // if (new RE("^\\([0-9][0-9]\\) ").match(s))
    // {
    // // Debug.debug("attempting to strip track number...", s);
    // int index = s.indexOf(')');
    // if (index >= 0)
    // {
    // parseTrackNumber(s.substring(1, index), flags);
    // s = s.substring(index + 1).trim();
    // }
    // }

    // Debug.debug("removeTrackNumbers 3", s);

    return s;
  }

  private String rectifySongTitle_1(String s, IMusicMetadata flags)
  {
    s = rectifyGeneric_1(s, flags);
    if (s == null)
      return null;

    s = removeTrackNumbers(s, flags);

    if (s == null)
      return null;

    s = removeQuotes(s);

    return s;
  }

  private String removeSuffixes(String s, String suffixes[])
  {
    return removeSuffixes(s, new Vector(Arrays.asList(suffixes)));
  }

  private String removeSuffixes(String s, Vector suffixes)
  {
    // return removeSuffixes(s, suffixes, "");
    // }
    //
    // private String removeSuffixes(String s, String suffixes[],
    // String suffix_prefix)
    // {
    if (s == null)
      return null;

    for (int i = 0; i < suffixes.size(); i++)
    {
      String suffix = (String) suffixes.get(i);
      // suffix = suffix_prefix + suffix;

      if (s.toLowerCase().endsWith(suffix.toLowerCase()))
        s = s.substring(0, s.length() - suffix.length());
    }
    return s;
  }

  private String removePrefixes(String s, String prefixes[])
  {
    return removePrefixes(s, new Vector(Arrays.asList(prefixes)));
    // // return removePrefixes(s, prefixes, "");
    // // }
    // //
    // // private String removePrefixes(String s, String prefixes[],
    // // String prefix_suffix)
    // // {
    // if (s == null)
    // return null;
    //
    // for (int i = 0; i < prefixes.length; i++)
    // {
    // String prefix = prefixes[i];
    // // prefix = prefix + prefix_suffix;
    //
    // if (s.toLowerCase().startsWith(prefix.toLowerCase()))
    // s = s.substring(prefix.length());
    // }
    // return s;
  }

  private String removePrefixes(String s, Vector prefixes)
  {
    if (s == null)
      return null;

    for (int i = 0; i < prefixes.size(); i++)
    {
      String prefix = (String) prefixes.get(i);
      // prefix = prefix + prefix_suffix;

      if (s.toLowerCase().startsWith(prefix.toLowerCase()))
        s = s.substring(prefix.length());
    }
    return s;
  }

  private String discardMatches(String s, String patterns[])
  {
    if (s == null)
      return null;

    for (int i = 0; i < patterns.length; i++)
    {
      String prefix = patterns[i];
      if (s.equalsIgnoreCase(prefix))
        return null;
    }
    return s;
  }

  public String rectifyAlbum(String s)
  {
    return rectifyAlbum(s, null);
  }

  public String rectifyAlbum(String s, IMusicMetadata flags)
  {
    String old = s;
    while (true)
    {
      s = rectifyAlbum_1(s, flags);
      if (s == null)
        return null;

      // if (artist != null)
      // {
      // if (s.startsWith(artist.name + " - "))
      // s = s.substring(artist.name.length() + 3);
      // if (s.endsWith(" - " + artist.name))
      // s = s.substring(0, s.length() - (artist.name.length() + 3));
      // }

      if (s.equals(old))
        return s;
      old = s;
    }
  }

  private String removeYearPrefixSuffix(String s)
  {
    // Debug.debug("removeYearPrefixSuffix before", s);

    if (s == null)
      return null;

    if (new RE("^\\(199[0-9]\\)").match(s)
        || new RE("^\\(200[0-9]\\)").match(s))
      s = s.substring(7);

    if (new RE("^\\( 199[0-9] \\)").match(s)
        || new RE("^\\( 200[0-9] \\)").match(s))
      s = s.substring(9);

    if (new RE("\\(199[0-9]\\)$").match(s)
        || new RE("\\(200[0-9]\\)$").match(s))
      s = s.substring(0, s.length() - 7);

    if (new RE("\\( 199[0-9] \\)$").match(s)
        || new RE("\\( 200[0-9] \\)$").match(s))
      s = s.substring(0, s.length() - 9);

    if (new RE("199[0-9] - ").match(s) || new RE("200[0-9] - ").match(s))
    {
      int index = s.indexOf('-');
      if (index >= 0)
      {
        String temp = s.substring(index + 1);
        if (temp.indexOf('-') < 0)
          s = temp;
      }
    }

    if (new RE("- 199[0-9]").match(s) || new RE(" - 200[0-9]").match(s))
    {
      int index = s.lastIndexOf('-');
      if (index >= 0)
      {
        String temp = s.substring(0, index);
        if (temp.indexOf('-') < 0)
          s = temp;
      }
    }

    // Debug.debug("removeYearPrefixSuffix after", s);

    return s;
  }

  /**
   * Media and release-type words (formats, "promo", "import", ...) that
   * {@link #rectifyAlbum_1(String, IMusicMetadata)} passes to
   * removeSafePrefixSuffix when cleaning an album name.
   */
  private static final String PATTERNS_ALBUM[] = { "dvd", //
      "10\"", //
      "12 - Inch", //
      "12 Inch", //
      "12 Inch Single", //
      "12\"", //
      "12\" Ep", //
      "12\" Vinyl", //
      "7 Inch", //
      "7\"", //
      "Advance", //
      "Advance Copy", //
      "Bonus Disc", //
      "Box", //
      "Cd", //
      "Cd Single", //
      "Cdm", //
      "Cdr", //
      "Cds", //
      "maxi", //
      "maxi single", //
      "Promo Cd", //
      "Ep", //
      "Full Vls", //
      // "Vls", //
      "Import", //
      "Lp", //
      // "Ost", //
      "Promo", //
      "Promo Cds", //
      "Retail", //
      "Single", //
      "Vinyl", //
      "Vinyl Single", //
      "Vls", //
      "cd", //
      "cds", //
      "ep", //
      "unknown album", //
      "Remastered", //
  };
  static
  {
    // Re-order the table in place once, at class-load time.
    // NOTE(review): presumably longest entries first, going by the
    // comparator's name - confirm against MyComparator.
    Arrays.sort(PATTERNS_ALBUM, MyComparator.kToStringLengthReverse);
  }

  /**
   * Words that {@link #rectifyArtist_1(String, IMusicMetadata)} passes to
   * removeSafePrefixSuffix when cleaning an artist name.
   */
  private static final String PATTERNS_ARTIST[] = { "skit", //
      "live", //
  };
  static
  {
    // Re-order the table in place once, at class-load time.
    // NOTE(review): presumably longest entries first, going by the
    // comparator's name - confirm against MyComparator.
    Arrays.sort(PATTERNS_ARTIST, MyComparator.kToStringLengthReverse);
  }

  public String rectifyAlbum_1(String s, IMusicMetadata flags)
  {
    s = rectifyGeneric_1(s, flags);
    if (s == null)
      return null;

    s = removeSafePrefixSuffix(s, PATTERNS_ALBUM);
    if (s == null)
      return null;

    if (s.endsWith(" Box Set"))
    {
      if (flags != null)
        flags.setIsCompilation(Boolean.TRUE);
    }

    s = removeYearPrefixSuffix(s);

    s = removeURLs(s);

    s = removeQuotes(s);

    {
      String old = s;

      s = removeSafePrefixSuffix(s, DEFAULT_ACAPELLA, true);
      // s = removeSafePrefixSuffix(s, kDEFAULTS_acapella);

      if (s == null || !s.equals(old))
      {
        if (flags != null)
          flags.setIsAcapella(Boolean.TRUE);
      }
      if (s == null)
        return null;
    }

    if (s.endsWith(" !"))
      s = s.substring(0, s.length() - 2);
    else if (s.endsWith(" (!)"))
      s = s.substring(0, s.length() - 4);

    return s;
  }

  private String removeURLs(String s)
  {
    if (s == null)
      return null;

    {
      if (new RE("^http://").match(s.toLowerCase()))
        return null;
      // if (new RE("^[hH][tT][tT][pP]://").match(s))
      // return null;
    }

    {
      String temp = s;
      temp = TextUtils.replace(temp, ". ", ".");

      // Debug.debug("s1", s);
      RE re = new RE("^[\\w \\-]*\\.[\\w \\.\\-]*\\.(com|net|org|edu)$");
      // re.setMatchFlags(RE.MATCH_CASEINDEPENDENT);
      if (re.match(temp.toLowerCase()))
        return null;

      // Debug.debug("s2", s);

      // if (new RE(
      // "^[\\w \\-]*\\.[\\w \\.\\-]*\\.([cC][oO][mM]|[oO][rR][gG]|[nN][eE][tT])$")
      // .match(temp))
      // {
      // // Debug.debug("discarding album url...", s);
      // return null;
      // }
    }

    return s;
  }

  public String rectifyArtist(String s)
  {
    return rectifyArtist(s, null);
  }

  public String rectifyArtist(String s, IMusicMetadata flags)
  {
    String old = s;
    while (true)
    {
      s = rectifyArtist_1(s, flags);
      if (s == null)
        return null;

      // s = removeTrackNumbers(s);

      if (s.equals(old))
        return s;
      old = s;
    }
  }

  private String rectifyArtist_1(String s, IMusicMetadata flags)
  {
    // Debug.debug("rectifyArtist_1 1", s);

    s = rectifyGeneric_1(s, flags);
    if (s == null)
      return null;

    // Debug.debug("rectifyArtist_1 2", s);

    if (s.equalsIgnoreCase("unknown artist"))
      return null;

    // Debug.debug("rectifyArtist_1 3", s);

    s = removeTrackNumbers(s, flags);
    s = removeYearPrefixSuffix(s);

    s = removeSafePrefixSuffix(s, PATTERNS_ARTIST);
    if (s == null)
      return null;

    {
      String old = s;

      // s = removeSafePrefixSuffix(s, kDEFAULTS_acapella);
      s = removeSafePrefixSuffix(s, DEFAULT_ACAPELLA, true);

      if (s == null || !s.equals(old))
      {
        if (flags != null)
          flags.setIsAcapella(Boolean.TRUE);
      }
      if (s == null)
        return null;
    }

    s = removeQuotes(s);
    s = removeURLs(s);

    return s;
  }

  public String rectifyGenre(String s)
  {
    String old = s;
    while (true)
    {
      s = rectifyGenre_1(s);
      if (s == null)
        return null;
      if (s.equals(old))
        return s;
      old = s;
    }
  }

  private String rectifyGenre_1(String s)
  {
    s = rectifyGeneric_1(s, null);
    if (s == null)
      return null;

    if (s.equalsIgnoreCase("music"))
      return null;

    s = removeQuotes(s);

    s = TextUtils.replace(s, " - ", "-");

    s = removeSafePrefixSuffix(s, "�", true);
    s = removeSafePrefixSuffix(s, DEFAULT_ACAPELLA, true);

    return s;
  }

  public String rectifyPublisher(String s)
  {
    String old = s;
    while (true)
    {
      s = rectifyPublisher_1(s);
      if (s == null)
        return null;
      if (s.equals(old))
        return s;
      old = s;
    }
  }

  private String rectifyPublisher_1(String s)
  {
    s = rectifyGeneric_1(s, null);
    if (s == null)
      return null;

    s = removeURLs(s);

    s = removeQuotes(s);

    s = TextUtils.replace(s, " - ", "-");

    return s;
  }

  // public Vector splitName(String s, DatabaseNamedItem.Type type,
  // IMusicMetadata flags)
  // {
  // if (s == null)
  // return new Vector();
  //
  // // if (s.indexOf('/') < 0)
  // // return new Vector(Arrays.asList(new String[]{
  // // s,
  // // }));
  //
  // String splits[] = TextUtils.split(s, '/');
  // Vector v = new Vector(Arrays.asList(splits));
  //
  // // Debug.debug("v(" + id + ")", v.size());
  //
  // // v = removeDuplicates(v, type);
  // // return v;
  //
  // Vector result = new Vector();
  //
  // for (int i = 0; i < v.size(); i++)
  // {
  // String child = (String) v.get(i);
  // child = type.rectifyName(child, flags);
  // if (child == null)
  // continue;
  //
  // result.remove(child);
  // result.add(child);
  // }
  //
  // Collections.sort(result);
  //
  // return result;
  // }

  // private static class SecondaryArtistTag
  // {
  // public final String tag;
  // public final Number artist_type_id;
  //
  // public SecondaryArtistTag(String tag, Number artist_type_id)
  // {
  // this.artist_type_id = artist_type_id;
  // this.tag = tag;
  // }
  // }

  // private static final SecondaryArtistTag kSECONDARY_ARTIST_TAGS[] = {
  // new SecondaryArtistTag("f\\.", kARTIST_TYPE_FEATURING), //
  // new SecondaryArtistTag("ft\\.", kARTIST_TYPE_FEATURING), //
  // new SecondaryArtistTag("feat\\.", kARTIST_TYPE_FEATURING), //
  // new SecondaryArtistTag("featuring ", kARTIST_TYPE_FEATURING), //
  // new SecondaryArtistTag("produced by ", kARTIST_TYPE_PRODUCER), //
  // // new SecondaryArtistTag(" remix", kARTIST_TYPE_MIX_ARTIST), //
  // // new SecondaryArtistTag(" mix", kARTIST_TYPE_MIX_ARTIST), //
  // };

  /**
   * Regular-expression fragments for the ways a featured artist is
   * introduced ("f.", "ft.", "feat.", "featuring "). The dots are escaped
   * because the entries are spliced into larger patterns by
   * {@link #processFeaturing(String, Vector, Vector)}.
   */
  private static final String FEATURING[] = { "f\\.", //
      "ft\\.", //
      "feat\\.", //
      "featuring ", //
  };

  private Vector listToNames(RE re)
  {
    Vector result = new Vector();

    int count = re.getParenCount();
    for (int i = 0; i < count / 2; i++)
    {
      String child = re.getParen(i * 2 + 1);
      // Debug.debug("child(" + i + ")", child);

      child = rectifyArtist(child);

      // Debug.debug("child.1(" + i + ")", child);
      result.remove(child);
      result.add(child);
    }
    return result;
  }

  private Vector debugRE(RE re)
  {
    Vector result = new Vector();

    int count = re.getParenCount();
    for (int i = 0; i < count; i++)
    {
      String child = re.getParen(i);
      Debug.debug("child(" + i + ")", child);
      result.add(child);
    }
    return result;
  }

  public String processFeaturing(String s, Vector primary_artists,
      Vector featured_artists, String pattern)
  {
    if (s == null)
      return null;

    RE re = new RE(pattern);
    if (re.match(s.toLowerCase()))
    {
      // Debug.debug("\t" + "featuring match(" + pattern + ")", s);
      // String wholeExpr = re.getParen(0);
      // Debug.debug("\t" + "wholeExpr", wholeExpr);
      // String insideParens = re.getParen(1);
      // Debug.debug("\t" + "insideParens", insideParens);

      int startInside = re.getParenStart(1);
      // Debug.debug("\t" + "startInside", startInside);
      int endInside = re.getParenEnd(1);
      // Debug.debug("\t" + "endInside", endInside);

      String left = s.substring(0, startInside);
      // Debug.debug("\t" + "left", left);
      String right = s.substring(startInside, endInside);
      // Debug.debug("\t" + "right", right);
      // int pattern_length = TextUtils.replace(pattern. "\\", "\");

      int index = Integer.MAX_VALUE;
      right = right.trim();

      {
        int i = right.indexOf('.');
        if (i >= 0)
          index = Math.min(index, i);
      }
      {
        int i = right.indexOf(' ');
        if (i >= 0)
          index = Math.min(index, i);
      }
      if (index < 0)
      {
        Debug.debug("\t" + "couldn't use match(" + pattern + ")", s);
        return s;
      }
      right = right.substring(index + 1);
      if (right.endsWith(")"))
        right = right.substring(0, right.length() - 1);
      if (left.endsWith("("))
        left = left.substring(0, left.length() - 1);
      right = right.trim();

      RE re2 = new RE("(.*)(,.*)*\\&(.*)");
      if (re2.match(right))
      {
        Vector v = listToNames(re2);
        featured_artists.removeAll(v);
        featured_artists.addAll(v);
      } else
      {
        re2 = new RE("(.*)(,.*)*\\ And (.*)");
        if (re2.match(right))
        {
          Vector v = listToNames(re2);
          featured_artists.removeAll(v);
          featured_artists.addAll(v);
        } else
        {
          // Debug.debug("simple featuring", right);
          right = rectifyArtist(right);
          featured_artists.remove(right);
          featured_artists.add(right);
        }
      }

      s = rectifyArtist(left);
      // Debug.debug("\t" + "s", s);
    }

    return s;
  }

  public void processFeaturing(String s, Vector primary_artists,
      Vector featured_artists)
  {
    // Debug.debug("processFeaturing", s);

    for (int i = 0; i < FEATURING.length; i++)
    {
      String regex = "\\((" + FEATURING[i] + ".*)\\)$";
      s = processFeaturing(s, primary_artists, featured_artists, regex);
    }
    for (int i = 0; i < FEATURING.length; i++)
    {
      String regex = "( " + FEATURING[i] + ".*$)";
      s = processFeaturing(s, primary_artists, featured_artists, regex);
    }

    if (primary_artists == null)
      return;

    s = rectifyArtist(s);
    primary_artists.remove(s);
    primary_artists.add(s);
  }

  private static final String ESCAPED = "^$.[|*+?\\(<)>#=/-{}";

  public String toRegexLiteral(String s)
  {
    StringBuffer result = new StringBuffer();

    char chars[] = s.toCharArray();
    for (int i = 0; i < chars.length; i++)
    {
      char c = chars[i];
      if (ESCAPED.indexOf(c) >= 0)
        result.append('\\');

      result.append(c);
    }
    return result.toString();
  }

  // private static final String kREGEX_OPEN_TOKENS = "\\(\\[\\{\\\"\\'";
  // private static final String kREGEX_CLOSE_TOKENS = "\\)\\]\\}\\\"\\'";

  public String getPrefixPattern(String s, boolean permissive)
  {
    return "^('" + s + "'|\\\"" + s + "\\\"|\\[" + s + "\\]|\\(" + s
        + "\\)|\\{" + s + "\\}|" + s + "\\-"
        + (permissive ? "|" + s + " " : "") + ")";
    // return ("^[" + kREGEX_OPEN_TOKENS + "]?" + s + " ?["
    // + (permissive ? " " : "") + "\\-" + kREGEX_CLOSE_TOKENS + "]");
  }

  public String getSuffixPattern(String s, boolean permissive)
  {
    return "('" + s + "'|\\\"" + s + "\\\"|\\[" + s + "\\]|\\(" + s
        + "\\)|\\{" + s + "\\}|\\-" + s + ""
        + (permissive ? "| " + s : "") + ")$";
    // return ("[" + (permissive ? " " : "") + "\\-" + kREGEX_OPEN_TOKENS
    // + "] ?" + s + "[" + kREGEX_CLOSE_TOKENS + "]?$");
  }

  public String getPrefixPattern2(String s)
  {
    return "^('.*'|\\\".*\\\"|\\[.*\\]|\\(.*\\)|\\{.*\\}|.*\\-) ?" + s
        + "$";
    // return ("^[" + kREGEX_OPEN_TOKENS + "]?" + "(.*)" + " ?["
    // // + (permissive ? " " : "")
    // + "\\-" + kREGEX_CLOSE_TOKENS + "] ?" + s + "$");
  }

  public String getSuffixPattern2(String s)
  {
    return "^" + s
        + " ?('.*'|\\\".*\\\"|\\[.*\\]|\\(.*\\)|\\{.*\\}|\\-.*)$";
    // return ("^" + s + " ?["
    // // + (permissive ? " " : "")
    // + "\\-" + kREGEX_OPEN_TOKENS + "] ?" + "(.*)" + "["
    // + kREGEX_CLOSE_TOKENS + "]?$");
  }

  private String stripRegexMatch(String s, String pattern)
  {
    if (s == null)
      return null;

    try
    {
      // Debug.debug("\t"+"s", s);
      // Debug.debug("\t"+"pattern", pattern);

      RE re = REGEX_CACHE.getRegEx(pattern);
      // RE re = new RE(pattern.toLowerCase());
      // Debug.debug("prefix", prefix_pattern);
      if (!re.match(s.toLowerCase()))
        return s;
      // String match = re.getParen(0);
      // Debug.debug("match(" + pattern + ")", s);
      s = s.substring(0, re.getParenStart(0))
          + s.substring(re.getParenEnd(0));
      // Debug.debug("updating...new_name ", new_name);
      return s;
    } catch (Exception e)
    {
      Debug.debug("s", s);
      Debug.debug("pattern", pattern);
      return s;
    }
  }

  private static class RegExCache
  {
    private final Map map = new Hashtable();
    // private final LinkedList order = new LinkedList();
    private static final int kMAX = 25000;

    public final RE getRegEx(String pattern)
    {
      if (pattern == null)
        return null;
      pattern = pattern.toLowerCase();
      RE result = (RE) map.get(pattern);
      if (result == null)
      {
        result = new RE(pattern);
        map.put(pattern, result);
      }
      // else
      // order.remove(pattern);
      // order.addFirst( pattern);
      //
      // if(order.size()>kMAX)
      // {
      // Object key = order.getLast();
      // order.removeLast();
      // map.remove(key);
      // }

      if (map.keySet().size() > kMAX)
      {
        Debug.debug("emptying regex cache.");
        map.clear();
      }
      return result;
    }
  }

  /**
   * Single cache instance shared by every MetadataCleanup object; consulted
   * by stripRegexMatch and extractRegexPattern to obtain compiled patterns.
   */
  private static final RegExCache REGEX_CACHE = new RegExCache();

  private String extractRegexPattern(String s, String pattern, int paren)
  {
    if (s == null)
      return null;

    try
    {
      RE re = REGEX_CACHE.getRegEx(pattern);
      // RE re = new RE(pattern.toLowerCase());
      if (!re.match(s.toLowerCase()))
        return s;

      if (paren < re.getParenCount())
      {
        s = re.getParen(paren);
      }

      return s;
    } catch (Exception e)
    {
      Debug.debug("s", s);
      Debug.debug("pattern", pattern);
      return s;
    }
  }

  private String removeSafePrefixSuffix(String s, String patterns[])
  {
    return removeSafePrefixSuffix(s, patterns, false);
  }

  private String removeSafePrefixSuffix(String s, String patterns[],
      boolean permissive)
  {
    if (s == null)
      return null;

    for (int i = 0; s != null && i < patterns.length; i++)
    {
      String pattern = patterns[i];

      s = removeSafePrefixSuffixLiteral(s, pattern, permissive);
    }

    return s;
  }

  private String removeSafePrefixSuffixLiteral(String s, String pattern,
      boolean permissive)
  {
    return removeSafePrefixSuffix(s, toRegexLiteral(pattern), permissive);
  }

  private String removeSafePrefixSuffix(String s, String pattern)
  {
    return removeSafePrefixSuffix(s, pattern, false);
  }

  private String removeSafePrefixSuffix(String s, String pattern,
      boolean permissive)
  {
    if (s == null)
      return null;

    if (s.equalsIgnoreCase(pattern))
      return null;

    s = stripRegexMatch(s, getPrefixPattern((pattern), permissive));
    s = stripRegexMatch(s, getSuffixPattern((pattern), permissive));

    s = extractRegexPattern(s, getPrefixPattern2((pattern)), 1);
    s = extractRegexPattern(s, getSuffixPattern2((pattern)), 1);

    return s;
  }

  // public String cleanItemWithItem(String haystack, String needle,
  // DatabaseNamedItem.Type type, IMusicMetadata flags)
  // {
  // if (haystack == null)
  // return null;
  // if (needle == null)
  // return haystack;
  //
  // String s = removeSafePrefixSuffix(haystack, toRegexLiteral(needle),
  // false);
  // if (s == null || !s.equals(haystack))
  // s = type.rectifyName(s, flags);
  // return s;
  // }
  //
  // public String cleanItemWithItem(String haystack, Vector needles,
  // DatabaseNamedItem.Type type, IMusicMetadata flags)
  // {
  // if (haystack == null)
  // return null;
  //
  // for (int i = 0; needles != null && haystack != null
  // && i < needles.size(); i++)
  // {
  // String needle = (String) needles.get(i);
  // haystack = cleanItemWithItem(haystack, needle, type, flags);
  // }
  //
  // return haystack;
  // }
  //
  // public Vector cleanItemWithItem(Vector haystacks, Vector needles,
  // DatabaseNamedItem.Type type, IMusicMetadata flags)
  // {
  // if (haystacks == null)
  // return null;
  //
  // Vector result = new Vector();
  // for (int i = 0; haystacks != null && i < haystacks.size(); i++)
  // {
  // String haystack = (String) haystacks.get(i);
  // haystack = cleanItemWithItem(haystack, needles, type, flags);
  // if (haystack != null)
  // result.add(haystack);
  // }
  //
  // return result;
  // }
  //
  // public String cleanItemWithItem(DatabaseNamedItem haystack,
  // DatabaseNamedItem needle)
  // {
  // if (haystack == null)
  // return null;
  // if (needle == null)
  // return haystack.name;
  //
  // String haystack_name = haystack.name;
  // String needle_name = needle.name;
  //
  // String s = removeSafePrefixSuffix(haystack_name,
  // toRegexLiteral(needle_name), false);
  // if (s == null || !s.equals(haystack_name))
  // s = haystack.getType().rectifyName(s);
  // return s;
  // }

}
TOP

Related Classes of org.cmc.music.clean.MetadataCleanup$RegExCache

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.