// Source code of cc.mallet.types.StringEditFeatureVectorSequence

/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */








/** 
   @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */


package cc.mallet.types;


import java.io.*;
import java.util.regex.*;
import java.util.HashMap;
import gnu.trove.TObjectIntHashMap;
import java.util.Set;
import java.util.Iterator;


// TODO: This version is not very space-efficient; it should be compressed later.


public class StringEditFeatureVectorSequence extends FeatureVectorSequence implements Serializable
{
  private int string1Length, string2Length;
  private String string1, string2;
  private String[] string1Blocks, string2Blocks;
  private TObjectIntHashMap string1Present, string2Present;
  private TObjectIntHashMap lexicon;
  private int[] block1Indices, block2Indices;
  private char delim = ':';
  private static final char defaultDelimiter = ':';


  public StringEditFeatureVectorSequence (FeatureVector[] featureVectors, String s1, String s2)
  {
    this (featureVectors, s1, s2, defaultDelimiter);
  }


  public StringEditFeatureVectorSequence(FeatureVector[] featureVectors, String s1, String s2, char delimiter)
  {
    this (featureVectors, s1, s2, delimiter, null);
  }


  public StringEditFeatureVectorSequence(FeatureVector[] featureVectors, String s1, String s2, HashMap lexic)
  {
    this (featureVectors, s1, s2, defaultDelimiter, lexic);
  }


  public StringEditFeatureVectorSequence(FeatureVector[] featureVectors, String s1, String s2, char delimiter, HashMap lexic)
  {
    super (featureVectors);
    this.delim = delimiter;
    
    this.lexicon = new TObjectIntHashMap();
    if (lexic != null) {
      Set keys = lexic.keySet();
      java.util.Iterator iter = keys.iterator();
      while (iter.hasNext())
        this.lexicon.put((String) iter.next(), 1);
    }


    this.string1 = s1;
    this.string2 = s2;
    this.string1Length = s1.length() + 2;
    this.string2Length = s2.length() + 2;
    string1Blocks = string1.split("" + delim);
    string2Blocks = string2.split("" + delim);
    string1Present = new TObjectIntHashMap();
    string2Present = new TObjectIntHashMap();
    block1Indices = new int[string1Length];
    if (string1Blocks.length > 0) {
      int whichBlock = 0;
      block1Indices[0] = whichBlock++;
      for (int i = 0; i < string1Blocks.length; i++)
        string1Present.put(string1Blocks[i], 1);
      for (int i = 1; i < string1Length-1; i++)
        block1Indices[i] = ((string1.charAt(i-1) == delim) ? whichBlock++ : -1);
      block1Indices[string1Length-1] = -1;
    }
    block2Indices = new int[string2Length];
    if (string2Blocks.length > 0) {
      int whichBlock = 0;
      block2Indices[0] = whichBlock++;
      for (int i = 0; i < string2Blocks.length; i++)
        string2Present.put(string2Blocks[i], 1);
      for (int i = 1; i < string2Length - 1; i++)
        block2Indices[i] = ((string2.charAt(i-1) == delim) ? whichBlock++ : -1);
      block2Indices[string2Length-1] = -1;
    }
  }
 
  public String getString1() {
    return string1;
  }


  public String getString2() {
    return string2;
  }


  public int getString1Length () {
    return string1Length;
  }


  public int getString2Length () {
    return string2Length;
  }


  // End of Block
  public int getString1EOBIndex(String delimiter) {
    return getString1EOBIndex(delimiter, 0);
  }


  public int getString1EOBIndex(String delimiter, int start) {
    return getString1IndexOf(delimiter, start);
  }


  public String getString1BlockAtIndex(int idx) {
    if (idx < 0 || idx >= block1Indices.length || block1Indices[idx] < 0 || block1Indices[idx] >= string1Blocks.length) return null;
    else return string1Blocks[block1Indices[idx]];
  }


  public int getString1IndexOf(String str, int start) {
    int toret = string1.indexOf(str, start);
  
    if (toret == -1)
      toret = string1.length() - 1 - start;
    else
      toret = toret - start;


    if (toret < 1)
      return -1;


    return toret;
  }


  public boolean isPresent1(String patternStr) {
    Pattern p = Pattern.compile(patternStr);
    Matcher m = p.matcher(string1);
    boolean b = m.matches();


    return b;
  }


  public boolean isPresentInString1(String str) {
    return string1Present.containsKey(str);
  }


  public char getString1Char(int index) {
    index = index - 1;
    if (index < 0 || index >= string1.length()) return (char) 0;
    else return string1.charAt(index);
  }


  public int getString2EOBIndex(String delimiter) {
    return getString2EOBIndex(delimiter, 0);
  }


  public int getString2EOBIndex(String delimiter, int start) {
    return getString2IndexOf(delimiter, start);
  }


  public String getString2BlockAtIndex(int idx) {
    if (idx < 0 || idx >= block2Indices.length || block2Indices[idx] < 0 || block2Indices[idx] >= string2Blocks.length) return null;
    else return string2Blocks[block2Indices[idx]];
  }


  public boolean isPresentInString2(String str) {
    return string2Present.containsKey(str);
  }


  public int getString2IndexOf(String str, int start) {
    int toret = string2.indexOf(str, start);
  
    if (toret == -1)
      toret = string2.length() - 1 - start;
    else
      toret = toret - start;


    if (toret < 1)
      return -1;


    return toret;
  }


  public boolean isPresent2(String patternStr) {
    Pattern p = Pattern.compile(patternStr);
    Matcher m = p.matcher(string2);
    boolean b = m.matches();


    return b;
  }


  public char getString2Char(int index) {
    index = index - 1;
    if (index < 0 || index >= string2.length()) return (char) 0;
    else return string2.charAt(index);
  }


  public boolean isInLexicon(String str) {
    if (lexicon == null || str == null) return false;


    return lexicon.containsKey(str);
  }


  public String toString ()
  {
    StringBuffer sb = new StringBuffer ();
    sb.append (super.toString());
    sb.append ('\n');
    sb.append ("String 1: " + string1Length + " String 2: " + string2Length);


    return sb.toString();
  }


  // Serialization of Instance


  private static final long serialVersionUID = 1;
  private static final int CURRENT_SERIAL_VERSION = 0;
  private static final int NULL_INTEGER = -1;


  private void writeObject (ObjectOutputStream out) throws IOException {
    out.writeInt (CURRENT_SERIAL_VERSION);
    out.writeInt (string1Length);
    out.writeInt (string2Length);
    out.writeObject (string1);
    out.writeObject (string2);


    if (string1Blocks == null) {
      out.writeInt(NULL_INTEGER);
    }
    else {
      int size = string1Blocks.length;
      out.writeInt(size);
      for(int i=0; i<size; i++) {
        out.writeObject(string1Blocks[i]);
      }
    }


    if (string2Blocks == null) {
      out.writeInt(NULL_INTEGER);
    }
    else {
      int size = string2Blocks.length;
      out.writeInt(size);
      for(int i=0; i<size; i++) {
        out.writeObject(string2Blocks[i]);
      }
    }


    out.writeObject(string1Present); 
    out.writeObject(string2Present); 
    out.writeObject(lexicon); 


    if (block1Indices == null) {
      out.writeInt(NULL_INTEGER);
    }
    else {
      int size = block1Indices.length;
      out.writeInt(size);
      for (int i=0; i<size; i++) {
        out.writeInt(block1Indices[i]);
      }
    }


    if (block2Indices == null) {
      out.writeInt(NULL_INTEGER);
    }
    else {
      int size = block2Indices.length;
      out.writeInt(size);
      for (int i=0; i<size; i++) {
        out.writeInt(block2Indices[i]);
      }
    }


    out.writeChar(delim);
  }


  private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
    int version = in.readInt ();
    int string1Length = in.readInt();
    int string2Length = in.readInt();
    String string1 = (String) in.readObject();
    String string2 = (String) in.readObject();
    int size = in.readInt();
    if (size == NULL_INTEGER) {
      string1Blocks = null;
    }
    else {
      string1Blocks = new String[size];
      for (int i = 0; i<size; i++) {
        string1Blocks[i] = (String) in.readObject();
      }
    }


    size = in.readInt();
    if (size == NULL_INTEGER) {
      string2Blocks = null;
    }
    else {
      string2Blocks = new String[size];
      for (int i = 0; i<size; i++) {
        string2Blocks[i] = (String) in.readObject();
      }
    }


    TObjectIntHashMap string1Present = (TObjectIntHashMap) in.readObject();
    TObjectIntHashMap string2Present = (TObjectIntHashMap) in.readObject();
    TObjectIntHashMap lexicon = (TObjectIntHashMap) in.readObject();


    size = in.readInt();
    if (size == NULL_INTEGER) {
      block1Indices = null;
    }
    else {
      block1Indices = new int[size];
      for (int i = 0; i<size; i++) {
        block1Indices[i] = in.readInt();
      }
    }


    size = in.readInt();
    if (size == NULL_INTEGER) {
      block2Indices = null;
    }
    else {
      block2Indices = new int[size];
      for (int i = 0; i<size; i++) {
        block2Indices[i] = in.readInt();
      }
    }


    delim = in.readChar();
  }
}
// Source code of cc.mallet.types.StringEditFeatureVectorSequence

// Related classes of cc.mallet.types.StringEditFeatureVectorSequence