Ultimate Sets and Maps for Java, Part I | Highly Scalable Blog
s a result we faced a necessity to design our own custom indexes and index processors with the following properties:
Cuckoo Hash Set
Array Set
it stores keys in the sorted array, contains() is implemented as a binary search. The shortcomings are obvious: inefficient add() operation and relatively low performance of binary search for large arrays. However, we can expect good performance and excellent memory footprint for small sets.
https://highlyscalable.wordpress.com/2012/01/01/ultimate-sets-and-maps-for-java-p2/
Bloom Filter
Reflector
StackSet
Read full article from Ultimate Sets and Maps for Java, Part I | Highly Scalable Blog
s a result we faced a necessity to design our own custom indexes and index processors with the following properties:
- Memory footprint of indexes should be as low as possible
- Index processing should be efficient in CPU utilization and memory allocation (produce a small amount of garbage objects)
- Indexes are mainly immutable i.e. performance of index building and modification is not significant
- Indexes should be partitionable i.e. it should be possible to distribute an index across the cluster
public
class
OpenAddressingSet {
protected
int
keys[];
protected
int
capacity;
protected
int
size;
protected
static
final
int
FREE = -
1
;
public
OpenAddressingSet(
int
capacity,
double
loadFactor) {
this
.capacity = capacity;
int
tableSize = MathUtils.nextPrime( ((
int
)(capacity / loadFactor)) );
keys =
new
int
[tableSize];
Arrays.fill(keys, FREE);
}
public
boolean
contains(
int
key) {
return
indexOfKey(key) >=
0
;
}
public
boolean
add(
int
key) {
if
(size >= capacity) {
throw
new
IllegalArgumentException(
"Set is full"
);
}
boolean
result;
result = addInternal(key);
if
(result) {
size++;
}
return
result;
}
protected
boolean
addInternal(
int
key) {
int
position = indexOfKey(key);
boolean
added =
false
;
if
(position <
0
) {
position = -position-
1
;
added =
true
;
}
keys[position] = key;
return
added;
}
private
int
indexOfKey(
int
key) {
final
int
length = keys.length;
int
startPosition = key % length;
// the first hash function
int
step =
1
+ (key % (length-
2
));
// the second hash function
while
(keys[startPosition] != key && keys[startPosition] != FREE) {
startPosition -= step;
if
(startPosition <
0
) {
startPosition += length;
}
}
if
(keys[startPosition] == FREE) {
return
-startPosition-
1
;
}
return
startPosition;
}
}
public
class
CuckooSet {
private
int
[] keysT1;
private
int
[] keysT2;
private
int
tableLength;
private
int
hashShift;
private
int
rehashCounter =
0
;
private
double
loadFactor;
private
int
capacity;
private
int
insertRounds;
private
static
final
long
H1 = 2654435761L;
private
static
final
long
H2 = 0x6b43a9b5L;
private
static
final
long
INT_MASK =
0x07FFFFFFF
;
private
static
final
int
FREE = -
1
;
public
CuckooSet(
int
capacity,
double
loadFactor,
int
insertRounds) {
this
.loadFactor = loadFactor;
this
.insertRounds = insertRounds;
initializeCapacity(capacity, loadFactor);
}
public
boolean
add(
int
key) {
if
(contains(key)) {
return
false
;
}
for
(
int
i =
0
; i < insertRounds; i++) {
int
h1 = h1(key);
int
register = keysT1[h1];
keysT1[h1] = key;
if
(register == FREE) {
return
true
;
}
key = register;
int
h2 = h2(key);
register = keysT2[h2];
keysT2[h2] = key;
if
(register == FREE) {
return
true
;
}
key = register;
}
rehash();
return
add(key);
}
public
boolean
contains(
int
key) {
return
keysT1[h1(key)] == key || keysT2[h2(key)] == key;
}
public
int
getRehashCounter() {
return
rehashCounter;
}
private
void
rehash() {
int
[] oldT1 = keysT1;
int
[] oldT2 = keysT2;
int
oldTableLength = tableLength;
initializeCapacity(capacity *
2
, loadFactor);
for
(
int
i =
0
; i < oldTableLength; i++) {
if
(oldT1[i] != FREE) {
add(oldT1[i]);
}
if
(oldT2[i] != FREE) {
add(oldT2[i]);
}
}
rehashCounter++;
}
private
void
initializeCapacity(
int
capacity,
double
loadFactor) {
this
.capacity = capacity;
tableLength = (
int
)(capacity / loadFactor);
hashShift =
32
- (
int
)Math.ceil(Math.log(tableLength) / Math.log(
2
));
keysT1 =
new
int
[tableLength];
keysT2 =
new
int
[tableLength];
Arrays.fill(keysT1, FREE);
Arrays.fill(keysT2, FREE);
}
private
int
h1(
int
key) {
return
((
int
)( ((
int
)(key * H1) >> hashShift) & INT_MASK) ) % tableLength;
}
private
int
h2(
int
key) {
return
((
int
)( ((
int
)(key * H2) >> hashShift) & INT_MASK) ) % tableLength;
}
}
it stores keys in the sorted array, contains() is implemented as a binary search. The shortcomings are obvious: inefficient add() operation and relatively low performance of binary search for large arrays. However, we can expect good performance and excellent memory footprint for small sets.
public
class
ArraySet {
private
int
[] array;
private
int
size =
0
;
public
ArraySet(
int
capacity) {
array =
new
int
[capacity];
Arrays.fill(array, -
1
);
}
public
boolean
add(
int
key) {
int
index = Arrays.binarySearch(array,
0
, size, key);
if
(index <
0
) {
int
insertIndex = -index-
1
;
if
(size < array.length -
1
) {
if
(insertIndex < size) {
System.arraycopy(array, insertIndex, array, insertIndex +
1
, size - insertIndex);
}
array[insertIndex] = key;
}
else
{
int
[] newArray =
new
int
[array.length +
1
];
System.arraycopy(array,
0
, newArray,
0
, insertIndex);
System.arraycopy(array, insertIndex, newArray, insertIndex +
1
, array.length - insertIndex);
newArray[insertIndex] = key;
array = newArray;
}
size++;
return
true
;
}
return
false
;
}
public
int
get(
int
position) {
return
array[position];
}
public
int
size() {
return
size;
}
public
boolean
contains(
int
key) {
return
Arrays.binarySearch(array, key) >=
0
;
}
}
Bloom Filter
Reflector
StackSet
Read full article from Ultimate Sets and Maps for Java, Part I | Highly Scalable Blog