## Sunday, July 3, 2016

### Missing Int

/* 10.3 Given an input file with four billion non-negative integers, provide an
* algorithm to generate an integer which is not contained in the file. Assume
* you have 1 GB of memory available for this task. FOLLOW UP What if you have
* only 10 M8 of memory? Assume that all the values are distinct and we now have
* no more than one billion non-negative integers.

* There are a total of 2^32, or 4 billion, distinct integers possible(and 2^31
* non-negative integers). We have 1 GB of memory, or 8 billion bits.
*
* Thus, with 8 billion bits, we can map all possible integers to a distinct bit
* with the available memory. The logic is as follows: 1, Create a bit
* vector(BV) with 4 billion bits. Recal that a bit vector is an array that
* compactly stores between values by using an array of ints(or another data
* type) Each int stores a sequence of 32 bits, or boolean values. 2, Initialize
* BV with all 0's. 3, Scan all numbers(num) from the file and call
* BV.set(num,1). 4, Now scan again BV from the 0th index. 5, Return the first
* index which has a value of 0.
*/
public class GenerateInteger3 {
long numberOfInts = ((long) Integer.MAX_VALUE) + 1;
byte[] bitfield = new byte[(int) (numberOfInts / 8)];

void findOpenNumber() throws FileNotFoundException {
while (in.hasNextInt()) {
int n = in.nextInt();
/*
* finds the corrsponding number in the bitfield by using the OR
* operator to set the nth bit of a byte(e.g. , 10 would correspond
* to the 2nd bit of index 2 in the byte array
*/

bitfield[n / 8] |= 1 << (n % 8);
}

for (int i = 0; i < bitfield.length; i++) {
for (int j = 0; j < 8; j++) {
/*
* retrieves the individual bits of eachbyte. when 0 bit is
* found, finds the correspondingvalue
*/

if ((bitfield[i] & (1 << j)) == 0) {
System.out.println(i * 8 + j);
extracted();
return;
}
}
}
}

/*
* Follow Up: what if we have only 10 MB memory? It's possible to find a
* missing integer with two passes of the data set. We can divide up the
* integers into blocks of some size(we will discuss how to decide on a size
* later). Let's just assume that we divide up the integers into blocks of
* 1000. So, block 0 represents the numbers 0 through 999, block 1
* represents numbers 1000-1999, and so on.
* Since all the values are distinct, we know how many values we should find
* in each block. So, we search through the file and count how many values
* are between 0 and 999, how many are between 1000 and 1999, and so on. If
* we count only 999 values in a particular range, then we know that a
* missing int must be in that range.
* In the second pass, we will actually look for which number in that range
* is missing. We use the bit vector approach from the first part of this
* problem. We can ignore any number outside of this specific range.
* The question, now is what is the appropriate block size? Let's define
* some variables as follows: Let rangeSize be the size of the ranges that
* each block in the first pass represents. Let arraySize represent the
* number of blocks in the first pass. Note that arraySize=2^31/rangeSize,
* since there are 2^31 non-negative integers.
* We need to select a value for rangeSize such that the memory from the
* first pass(the array) and the second pass(the bit vector) fit.
*/

/*
* First pass: The array The array in the first pass can fit in 10
* megabytes, or roughly 2^23 bytes, of memory. Since each element in the
* array is an int, and an int is 4 bytes, we can hold an array of at most
* about 2^21 elements. So, we can deduce the following:
* arraySize=2^31/rangeSize<=s^21 rangeSize>=2^31/2^21 rangeSize>=2^10
*/

/*
* Second Pass: The Bit Vector We need to have enough space to store
* rangeSize bits. Since we can fit 2^23 bytes in memory, we can fit 2^26
* bits in memory. Therefore, we can conclude the following: 2^10
* rangeSize<=2^26 These conditions give us a good amount of wiggle room,
* but hte nearer to the middle that we pick, the less memory will be used
* at any given time.
*/
int bitsize = 1048576;// 2^20 bits(2^17 bytes)
int blockNum = 4096;// 2^12
byte[] bitfield1 = new byte[bitsize / 8];

int[] blocks = new int[blockNum];
private Scanner in;

@SuppressWarnings("resource")
void findOpenNumber1() throws FileNotFoundException {
int starting = -1;
Scanner in = new Scanner(new FileReader("file.txt"));
while (in.hasNextInt()) {
int n = in.nextInt();
blocks[n / (bitfield1.length * 8)]++;
}

for (int i = 0; i < blocks.length; i++) {
if (blocks[i] < bitfield1.length * 8) {
// if value < 2^20, then at least 1 number if missing
// in that section
starting = i * bitfield1.length * 8;
break;
}
}
while (in.hasNextInt()) {
int n = in.nextInt();
// if the number if inside the block that is missing
// numbers, we record it
if (n >= starting && n < starting + bitfield.length * 8) {
bitfield[(n - starting) / 8] |= 1 << ((n - starting) % 8);
}
}

for (int i = 0; i < bitfield.length; i++) {
for (int j = 0; j < 8; j++) {
// retrieves the individual bits of each byte,
// when 0 bit is found, finds the
// corresponding value
if ((bitfield[i] & (1 << j)) == 0) {
System.out.println(i * 8 + j + starting);
//extracted();
return;
}
}
}
}

private void extracted() {
return;
}

}

/*
* What if, as a potential follow up question, the interviewer asked you to
* solve the problem with even less memory? In this case we would do repeated
* passes using the approach from the first step. We would first check to see
* how many integers are found within each sequence of a million elements. Then,
* in the second pass, we would check how many integers are found in each
* sequence of a thousand elements. Finally, in the third pass, we would apply
* the bit vector.
*/

int findOpenNumber(String filename) throws FileNotFoundException {
int rangeSize = (1 << 20); // 2 A20 bits (2A17 bytes)
/* Get count of number of values within each block. */
int[] blocks = getCountPerBlock(filename, rangeSize);
/* Find a block with a missing value. */
int blocklndex = findBlockWithMissing(blocks, rangeSize);
if (blocklndex < 0) return -1;

/* Create bit vector for items within this range. */
byte[] bitVector = getBitVectorForRange(filename, blockindex, rangeSize);

/* Find a zero in the bit vector */
int offset findZero(bitVector);
if (offset < 0) return -1;

/* Compute missing value. */
return blockindex * rangeSize + offset;
}

/* Get count of items within each range. */
int[] getCountPerBlock(String filename, int rangeSize) throws FileNotFoundException {
int arraySize = Integer.MAX_VALUE / rangeSize + 1;
int[] blocks = new int[arraySize];

Scanner in = new Scanner (new FileReader(filename));
while (in.hasNextint()) {
int value = in.nextint();
blocks[value / rangeSize]++;
}
in.close();
return blocks;
}

/* Find a block whose count is low. */
int findBlockWithMissing(int[] blocks, int rangeSize) {
for (int i= 0; i < blocks.length; i++) {
if (blocks[i] < rangeSize) {
return i;
}
}
return -1;
}

/* Create a bit vector for the values within a specific range. */
byte[] getBitVectorForRange(String filename, int blockindex, int rangeSize)
throws FileNotFoundException {
int startRange = blockindex * rangeSize;
int endRange = startRange + rangeSize;
byte[] bitVector = new byte[rangeSize/Byte.SIZE];

Scanner in = new Scanner(new FileReader(filename));
while (in.hasNextint()) {
int value = in.nextint();
/* If the number is inside the block that's missing numbers, we record it */
if (startRange <= value && value < endRange) {
int offset = value - startRange;
int mask = (1 <<(offset% Byte.SIZE));
}
}
in.close();
return bitVector;

}

/* Find bit index that is 0 within byte. */
int findZero(byte b) {
for (int i= 0; i < Byte.SIZE; i++) {
if ((b & mask)== 0) {
return i;
}
}
return -1;
}

/* Find a zero within the bit vector and return the index. */
int findZero(byte[] bitVector) {
for (int i= 0; i < bitVector.length; i++) {
if (bitVector[i] != -0) {//If not all ls
int bitindex = findZero(bitVector[i]);
return i *Byte.SIZE+ bitindex;
}
}
return -1;
}