Ultimate Sets and Maps for Java, Part I | Highly Scalable Blog
s a result we faced a necessity to design our own custom indexes and index processors with the following properties:
Cuckoo Hash Set
Array Set
it stores keys in the sorted array, contains() is implemented as a binary search. The shortcomings are obvious: inefficient add() operation and relatively low performance of binary search for large arrays. However, we can expect good performance and excellent memory footprint for small sets.
https://highlyscalable.wordpress.com/2012/01/01/ultimate-sets-and-maps-for-java-p2/
Bloom Filter
Reflector
StackSet
Read full article from Ultimate Sets and Maps for Java, Part I | Highly Scalable Blog
s a result we faced a necessity to design our own custom indexes and index processors with the following properties:
- Memory footprint of indexes should be as low as possible
- Index processing should be efficient in CPU utilization and memory allocation (produce a small amount of garbage objects)
- Indexes are mainly immutable i.e. performance of index building and modification is not significant
- Indexes should be partitionable i.e. it should be possible to distribute an index across the cluster
public class OpenAddressingSet { protected int keys[]; protected int capacity; protected int size; protected static final int FREE = -1; public OpenAddressingSet(int capacity, double loadFactor) { this.capacity = capacity; int tableSize = MathUtils.nextPrime( ((int)(capacity / loadFactor)) ); keys = new int[tableSize]; Arrays.fill(keys, FREE); } public boolean contains(int key) { return indexOfKey(key) >= 0; } public boolean add(int key) { if(size >= capacity) { throw new IllegalArgumentException("Set is full"); } boolean result; result = addInternal(key); if(result) { size++; } return result; } protected boolean addInternal(int key) { int position = indexOfKey(key); boolean added = false; if(position < 0) { position = -position-1; added = true; } keys[position] = key; return added; } private int indexOfKey(int key) { final int length = keys.length; int startPosition = key % length; // the first hash function int step = 1 + (key % (length-2)); // the second hash function while (keys[startPosition] != key && keys[startPosition] != FREE) { startPosition -= step; if (startPosition < 0) { startPosition += length; } } if(keys[startPosition] == FREE) { return -startPosition-1; } return startPosition; }}public class CuckooSet { private int[] keysT1; private int[] keysT2; private int tableLength; private int hashShift; private int rehashCounter = 0; private double loadFactor; private int capacity; private int insertRounds; private static final long H1 = 2654435761L; private static final long H2 = 0x6b43a9b5L; private static final long INT_MASK = 0x07FFFFFFF; private static final int FREE = -1; public CuckooSet(int capacity, double loadFactor, int insertRounds) { this.loadFactor = loadFactor; this.insertRounds = insertRounds; initializeCapacity(capacity, loadFactor); } public boolean add(int key) { if(contains(key)) { return false; } for(int i = 0; i < insertRounds; i++) { int h1 = h1(key); int register = keysT1[h1]; keysT1[h1] = key; if(register == FREE) { return true; } key = register; int h2 = h2(key); register = keysT2[h2]; keysT2[h2] = key; if(register == FREE) { return true; } key = register; } rehash(); return add(key); } public boolean contains(int key) { return keysT1[h1(key)] == key || keysT2[h2(key)] == key; } public int getRehashCounter() { return rehashCounter; } private void rehash() { int[] oldT1 = keysT1; int[] oldT2 = keysT2; int oldTableLength = tableLength; initializeCapacity(capacity * 2, loadFactor); for(int i = 0; i < oldTableLength; i++) { if(oldT1[i] != FREE) { add(oldT1[i]); } if(oldT2[i] != FREE) { add(oldT2[i]); } } rehashCounter++; } private void initializeCapacity(int capacity, double loadFactor) { this.capacity = capacity; tableLength = (int)(capacity / loadFactor); hashShift = 32 - (int)Math.ceil(Math.log(tableLength) / Math.log(2)); keysT1 = new int[tableLength]; keysT2 = new int[tableLength]; Arrays.fill(keysT1, FREE); Arrays.fill(keysT2, FREE); } private int h1(int key) { return ((int)( ((int)(key * H1) >> hashShift) & INT_MASK) ) % tableLength; } private int h2(int key) { return ((int)( ((int)(key * H2) >> hashShift) & INT_MASK) ) % tableLength; }}it stores keys in the sorted array, contains() is implemented as a binary search. The shortcomings are obvious: inefficient add() operation and relatively low performance of binary search for large arrays. However, we can expect good performance and excellent memory footprint for small sets.
public class ArraySet { private int[] array; private int size = 0; public ArraySet(int capacity) { array = new int[capacity]; Arrays.fill(array, -1); } public boolean add(int key) { int index = Arrays.binarySearch(array, 0, size, key); if (index < 0) { int insertIndex = -index-1; if(size < array.length - 1) { if(insertIndex < size) { System.arraycopy(array, insertIndex, array, insertIndex + 1, size - insertIndex); } array[insertIndex] = key; } else { int[] newArray = new int[array.length + 1]; System.arraycopy(array, 0, newArray, 0, insertIndex); System.arraycopy(array, insertIndex, newArray, insertIndex + 1, array.length - insertIndex); newArray[insertIndex] = key; array = newArray; } size++; return true; } return false; } public int get(int position) { return array[position]; } public int size() { return size; } public boolean contains(int key) { return Arrays.binarySearch(array, key) >= 0; }}Bloom Filter
Reflector
StackSet
Read full article from Ultimate Sets and Maps for Java, Part I | Highly Scalable Blog