Package me.shenfeng.mmseg

Source Code of me.shenfeng.mmseg.HashSet

package me.shenfeng.mmseg;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.math.BigInteger;
import java.util.Arrays;
import java.util.HashMap;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Tallies how many times each key has been passed to {@link #add(Object)}.
 */
class Counter {

    /** Occurrence count per key; a key is absent until it is added for the first time. */
    private HashMap<Object, Integer> counter = new HashMap<Object, Integer>();

    /**
     * Records one more occurrence of {@code key}.
     *
     * @param key the value to count
     */
    public void add(Object key) {
        Integer seen = counter.get(key);
        // A key that has never been seen starts at one occurrence.
        int updated = (seen == null) ? 1 : seen + 1;
        counter.put(key, updated);
    }

    /**
     * @return the string form of the underlying key-to-count map
     */
    public String toString() {
        return String.valueOf(counter);
    }
}

class HashSet {
    private Object[] data;
    final int prime;

    public HashSet(int count) {
        prime = getPrime(count);
        data = new Object[prime];
    }

    public void insert(Word w) {
        int i = w.hashCode() % prime;
        Object val = data[i];
        if (val == null) {
            data[i] = w;
        } else {
            Word[] arr;
            if (val instanceof Word) {
                arr = new Word[] { (Word) val };
            } else {
                arr = (Word[]) val;
            }
            arr = Arrays.copyOf(arr, arr.length + 1);
            arr[arr.length - 1] = w;
            data[i] = arr;
        }
    }

    public Counter getLoad() {
        Counter c = new Counter();
        for (Object slot : data) {
            if (slot == null) {
                c.add(0);
            } else if (slot instanceof Word) {
                c.add(1);
            } else {
                c.add(((Word[]) slot).length);
            }
        }
        return c;
    }

    public boolean contains(Word w) {
        int i = w.hashCode() % prime;
        Object val = data[i];
        if (val instanceof Word) {
            return w.equals(val);
        } else if (val instanceof Word[]) {
            Word arr[] = (Word[]) val;
            for (Word word : arr) {
                if (word.equals(w)) {
                    return true;
                }
            }
        }

        return false;
    }

    private int getPrime(int n) {
        int prev = 0;
        BigInteger b = BigInteger.valueOf(n);
        int prime = 0;
        int max = n + n / 2;
        while ((prime = b.nextProbablePrime().intValue()) < max) {
            prev = prime;
            b = BigInteger.valueOf(prime);
        }
        return prev;
    }
}

public class HashSetDictionary implements Dictionary {
    private Logger logger = LoggerFactory.getLogger(HashSetDictionary.class);

    public HashSetDictionary(InputStream is) throws IOException {
        load(is);
    }

    private int maxWordLength = 0;
    private HashSet set;

    private void load(InputStream file) throws IOException {
        long start = System.currentTimeMillis();
        char buffer[] = new char[1024 * 768];
        int offsets[] = new int[1024 * 40];
        int lengths[] = new int[1024 * 40];
        InputStreamReader fr = new InputStreamReader(file);
        int charIdx = 0;
        int wordIdx = 0;
        int length = 0;
        int read = 0;
        while ((read = fr.read()) != -1) {
            if (read == '\r') { // ignore
            } else if (read == '\n') {
                if (length != 0) {
                    if (wordIdx == offsets.length) {
                        offsets = Arrays.copyOf(offsets, wordIdx * 2);
                        lengths = Arrays.copyOf(lengths, wordIdx * 2);
                    }
                    lengths[wordIdx] = length;
                    offsets[wordIdx] = charIdx - length;
                    wordIdx++;
                    length = 0;
                }
            } else {
                if (charIdx == buffer.length) {
                    buffer = Arrays.copyOf(buffer, charIdx * 2);
                }
                length++;
                buffer[charIdx++] = (char) read;
            }
        }
        buffer = Arrays.copyOf(buffer, charIdx);
        set = new HashSet(wordIdx);
        for (int i = 0; i < wordIdx; i++) {
            if (lengths[i] > maxWordLength) {
                maxWordLength = lengths[i];
            }
            Word w = new Word(buffer, offsets[i], lengths[i]);
            set.insert(w);
        }
        long time = System.currentTimeMillis() - start;
//        logger.info(
//                "load: {}ms, word: {}, max word length: {}, bucket: {}, hash: {}",
//                new Object[] { time, wordIdx, maxWordLength, set.prime,
//                        set.getLoad() });
    }

    public int maxMath(char[] buffer, int offset, int length) {
        int maxLength = Math.min(length, maxWordLength);
        for (int i = maxLength; i > 1; --i) {
            Word w = new Word(buffer, offset, i);
            if (set.contains(w)) {
                return i;
            }
        }
        return 1;
    }
}
TOP

Related Classes of me.shenfeng.mmseg.HashSet

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.