Find the k most frequent words from a file | GeeksforGeeks
Given a book of words, and assuming there is enough main memory to accommodate all of them, design a data structure to find the top k maximum occurring words. The data structure should be dynamic, so that new words can be added.
We can use a Trie and a Min Heap to get the k most frequent words efficiently. The idea is to use the Trie for searching existing words and adding new words efficiently; the Trie also stores the count of occurrences of each word. A Min Heap of size k keeps track of the k most frequent words at any point in time (the Min Heap is used the same way as in finding the k largest elements, discussed in this post).
The Trie and the Min Heap are linked to each other by storing an additional field 'indexMinHeap' in the Trie and a pointer 'trNode' in the Min Heap. 'indexMinHeap' is kept at -1 for words that are currently not in the Min Heap (i.e., currently not among the top k frequent words); for words present in the Min Heap, it holds the word's index in the heap. The pointer 'trNode' in the Min Heap points to the word's leaf node in the Trie.
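To make the Trie half of this linkage concrete, here is a minimal Java sketch (an illustrative stand-in, not the article's C code): each terminal node carries a frequency count and the indexMinHeap back-pointer described above, initialized to -1. Children are held in a HashMap rather than a fixed 26-slot array; all names here are illustrative.

```java
import java.util.HashMap;
import java.util.Map;

// Minimal sketch of the Trie side of the Trie + Min Heap structure.
class TrieSketch {
    static class Node {
        Map<Character, Node> child = new HashMap<>();
        boolean isEnd;
        int frequency;         // occurrences of the word ending here
        int indexMinHeap = -1; // -1: not currently among the top k
    }

    final Node root = new Node();

    // Inserts a word, bumping its count, and returns the terminal node so a
    // heap entry can keep a pointer back to it (the role of 'trNode').
    Node insert(String word) {
        Node cur = root;
        for (char c : word.toCharArray())
            cur = cur.child.computeIfAbsent(c, x -> new Node());
        cur.isEnd = true;
        cur.frequency++;
        return cur;
    }
}
```

Returning the terminal node from insert is what lets the heap store a direct pointer into the Trie, so a frequency bump never requires a second lookup.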
struct TrieNode
{
    bool isEnd;                 // indicates end of word
    unsigned frequency;         // the number of occurrences of a word
    int indexMinHeap;           // the index of the word in minHeap
    TrieNode* child[MAX_CHARS]; // 26 slots, one for each of 'a' to 'z'
};

// A Min Heap node
struct MinHeapNode
{
    TrieNode* root;     // points to the leaf node of the word in Trie
    unsigned frequency; // number of occurrences
    char* word;         // the actual word stored
};

// A Min Heap
struct MinHeap
{
    unsigned capacity;  // the total size of the min heap
    int count;          // the number of slots filled
    MinHeapNode* array; // the collection of minHeapNodes
};

void printKMostFreq( FILE* fp, int k )
{
    // Create a Min Heap of size k
    MinHeap* minHeap = createMinHeap( k );

    // Create an empty Trie
    TrieNode* root = NULL;

    // A buffer to store one word at a time
    char buffer[MAX_WORD_SIZE];

    // Read words one by one from the file; insert each into Trie and Min Heap
    while( fscanf( fp, "%s", buffer ) != EOF )
        insertTrieAndHeap(buffer, &root, minHeap);

    // The Min Heap now holds the k most frequent words, so print its nodes
    displayMinHeap( minHeap );
}

// Inserts a new word into both Trie and Heap
void insertUtil ( TrieNode** root, MinHeap* minHeap,
                  const char* word, const char* dupWord )
{
    // Base case
    if ( *root == NULL )
        *root = newTrieNode();

    // There are still more characters in word
    if ( *word != '\0' )
        insertUtil ( &((*root)->child[ tolower( *word ) - 97 ]),
                     minHeap, word + 1, dupWord );
    else // the complete word is processed
    {
        // word is already present, increase the frequency
        if ( (*root)->isEnd )
            ++( (*root)->frequency );
        else
        {
            (*root)->isEnd = 1;
            (*root)->frequency = 1;
        }

        // Insert in min heap also
        insertInMinHeap( minHeap, root, dupWord );
    }
}

// Adds a word to Trie & min heap.
// A wrapper over insertUtil
void insertTrieAndHeap(const char *word, TrieNode** root, MinHeap* minHeap)
{
    insertUtil( root, minHeap, word, word );
}

// Inserts a word into the heap; handles the 3 cases explained above
void insertInMinHeap( MinHeap* minHeap, TrieNode** root, const char* word )
{
    // Case 1: the word is already present in minHeap
    if( (*root)->indexMinHeap != -1 )
    {
        ++( minHeap->array[ (*root)->indexMinHeap ].frequency );

        // percolate down
        minHeapify( minHeap, (*root)->indexMinHeap );
    }

    // Case 2: the word is not present and the heap is not full
    else if( minHeap->count < minHeap->capacity )
    {
        int count = minHeap->count;
        minHeap->array[ count ].frequency = (*root)->frequency;
        minHeap->array[ count ].word = new char[strlen( word ) + 1];
        strcpy( minHeap->array[ count ].word, word );

        minHeap->array[ count ].root = *root;
        (*root)->indexMinHeap = minHeap->count;

        ++( minHeap->count );
        buildMinHeap( minHeap );
    }

    // Case 3: the word is not present, the heap is full, and the word's
    // frequency is greater than the root's. The root is the least frequent
    // word in the heap, so replace it with the new word
    else if ( (*root)->frequency > minHeap->array[0].frequency )
    {
        minHeap->array[ 0 ].root->indexMinHeap = -1;
        minHeap->array[ 0 ].root = *root;
        minHeap->array[ 0 ].root->indexMinHeap = 0;
        minHeap->array[ 0 ].frequency = (*root)->frequency;

        // delete previously allocated memory
        delete [] minHeap->array[ 0 ].word;
        minHeap->array[ 0 ].word = new char[strlen( word ) + 1];
        strcpy( minHeap->array[ 0 ].word, word );

        minHeapify( minHeap, 0 );
    }
}

http://www.zrzahid.com/top-k-or-k-most-frequent-words-in-a-document/
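The three cases handled by insertInMinHeap can be restated compactly with Java collections. This is a simplified, hypothetical sketch: a size-k PriorityQueue plus a word-to-entry map standing in for the hand-rolled heap and the indexMinHeap back-pointers. Note that PriorityQueue.remove(Object) is O(k), unlike the index-based minHeapify re-sift in the C code.

```java
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.PriorityQueue;

// Simplified restatement of the three insertInMinHeap cases.
class BoundedTopK {
    static class Entry {
        final String word;
        int freq;
        Entry(String w, int f) { word = w; freq = f; }
    }

    final int k;
    final PriorityQueue<Entry> heap =
        new PriorityQueue<>(Comparator.comparingInt((Entry e) -> e.freq));
    final Map<String, Entry> inHeap = new HashMap<>(); // word -> its heap entry

    BoundedTopK(int k) { this.k = k; }

    void offer(String word, int newFreq) {
        Entry e = inHeap.get(word);
        if (e != null) {                         // Case 1: already in heap
            heap.remove(e);                      // re-key: remove, update, re-add
            e.freq = newFreq;
            heap.add(e);
        } else if (heap.size() < k) {            // Case 2: heap not yet full
            e = new Entry(word, newFreq);
            heap.add(e);
            inHeap.put(word, e);
        } else if (newFreq > heap.peek().freq) { // Case 3: evict least frequent
            Entry evicted = heap.poll();
            inHeap.remove(evicted.word);
            e = new Entry(word, newFreq);
            heap.add(e);
            inHeap.put(word, e);
        }
        // else: word not in heap and too infrequent to enter — do nothing
    }
}
```

The inHeap map answers "is this word among the current top k" in O(1), exactly the question indexMinHeap answers in the Trie version.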
A more efficient O(n) time and O(k) extra space solution
public String[] topKWordsSelect(final String stream, final int k) {
    final Map<String, Integer> frequencyMap = new HashMap<String, Integer>();
    final String[] words = stream.toLowerCase().trim().split(" ");

    for (final String word : words) {
        int freq = 1;
        if (frequencyMap.containsKey(word)) {
            freq = frequencyMap.get(word) + 1; // use MutableInt
        }
        // update the frequency map
        frequencyMap.put(word, freq);
    }

    // Find the kth largest frequency, i.e., the (n - k + 1)th smallest
    // (kthSmallest uses 1-based ranks)
    final int[] frequencies = new int[frequencyMap.size()];
    int i = 0;
    for (final int value : frequencyMap.values()) {
        frequencies[i++] = value;
    }
    final int kthLargestFreq = kthSmallest(frequencies, 0, i - 1, i - k + 1);

    // extract the top k
    final String[] topK = new String[k];
    i = 0;
    for (final java.util.Map.Entry<String, Integer> entry : frequencyMap.entrySet()) {
        if (entry.getValue().intValue() >= kthLargestFreq) {
            topK[i++] = entry.getKey();
            if (i == k) {
                break;
            }
        }
    }

    return topK;
}
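topKWordsSelect delegates the threshold computation to a quickselect routine (kthSmallest, shown next). As a self-contained sanity check of that selection step, here is an iterative sketch with the same 1-based-rank contract — illustrative, not the article's exact code:

```java
// Self-contained randomized quickselect: returns the kth smallest element
// (1-based rank) of a[lo..hi]. The array is partially reordered in place.
class QuickSelectDemo {
    static int kthSmallest(int[] a, int lo, int hi, int k) {
        while (lo < hi) {
            int p = partition(a, lo, hi);
            int rank = p - lo + 1;            // pivot's rank within a[lo..hi]
            if (k == rank) return a[p];
            if (k < rank) hi = p - 1;         // answer lies left of the pivot
            else { lo = p + 1; k -= rank; }   // lies right; adjust the rank
        }
        return a[lo]; // single element left; k must be 1 for valid input
    }

    // Lomuto partition around a randomly chosen pivot.
    private static int partition(int[] a, int lo, int hi) {
        int pivotIdx = lo + (int) (Math.random() * (hi - lo + 1));
        swap(a, pivotIdx, hi);
        int pivot = a[hi], i = lo - 1;
        for (int j = lo; j < hi; j++)
            if (a[j] <= pivot) swap(a, ++i, j);
        swap(a, i + 1, hi);
        return i + 1;
    }

    private static void swap(int[] a, int i, int j) {
        int t = a[i]; a[i] = a[j]; a[j] = t;
    }
}
```

With frequencies {1, 1, 1, 2, 3} and k = 2 top words, the threshold is the (5 - 2 + 1) = 4th smallest frequency, which is 2.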
private static void swap(final int[] input, final int i, final int j) {
    final int temp = input[i];
    input[i] = input[j];
    input[j] = temp;
}

private static int partition(final int[] A, final int p, final int r) {
    final int pivot = A[r];
    int i = p - 1;
    for (int j = p; j < r; j++) {
        if (A[j] <= pivot) {
            swap(A, ++i, j);
        }
    }
    swap(A, i + 1, r);
    return i + 1;
}

private static int randomizedPartition(final int[] A, final int p, final int r) {
    final int i = (int) Math.round(p + Math.random() * (r - p));
    swap(A, i, r);
    return partition(A, p, r);
}

public static int kthSmallest(final int[] A, final int p, final int r, final int k) {
    if (p == r) {
        return A[p]; // single element; k must be 1 here for valid input
    }
    if (p < r) {
        final int q = randomizedPartition(A, p, r);
        final int n = q - p + 1; // rank of the pivot within A[p..r]
        if (k == n) {
            return A[q];
        } else if (k < n) {
            return kthSmallest(A, p, q - 1, k);
        } else {
            return kthSmallest(A, q + 1, r, k - n);
        }
    }
    return Integer.MIN_VALUE; // invalid range
}

An O(n log k) solution with O(n) space
public String[] topKWords(final String stream, final int k) {
    final class WordFreq implements Comparable<WordFreq> {
        String word;
        int freq;

        public WordFreq(final String w, final int c) {
            word = w;
            freq = c;
        }

        @Override
        public int compareTo(final WordFreq other) {
            return Integer.compare(this.freq, other.freq);
        }
    }

    final Map<String, Integer> frequencyMap = new HashMap<String, Integer>();
    final PriorityQueue<WordFreq> topKHeap = new PriorityQueue<WordFreq>(k);
    final String[] words = stream.toLowerCase().trim().split(" ");

    for (final String word : words) {
        int freq = 1;
        if (frequencyMap.containsKey(word)) {
            freq = frequencyMap.get(word) + 1;
        }
        // update the frequency map
        frequencyMap.put(word, freq);
    }

    // Build the top-k min-heap
    for (final java.util.Map.Entry<String, Integer> entry : frequencyMap.entrySet()) {
        if (topKHeap.size() < k) {
            topKHeap.add(new WordFreq(entry.getKey(), entry.getValue()));
        } else if (entry.getValue() > topKHeap.peek().freq) {
            topKHeap.remove();
            topKHeap.add(new WordFreq(entry.getKey(), entry.getValue()));
        }
    }

    // extract the top k
    final String[] topK = new String[k];
    int i = 0;
    while (topKHeap.size() > 0) {
        topK[i++] = topKHeap.remove().word;
    }

    return topK;
}
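A usage sketch of the heap-based method above, rewritten as a self-contained static method (the original is an instance method; PriorityQueue over map entries replaces the WordFreq wrapper — names here are illustrative):

```java
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;

class TopKDemo {
    // Returns the k most frequent words; least frequent of the top k first.
    static List<String> topK(String stream, int k) {
        Map<String, Integer> freq = new HashMap<>();
        for (String w : stream.toLowerCase().trim().split("\\s+"))
            freq.merge(w, 1, Integer::sum);

        // Min-heap ordered by frequency keeps only the k best seen so far.
        PriorityQueue<Map.Entry<String, Integer>> heap =
            new PriorityQueue<>(Map.Entry.comparingByValue());
        for (Map.Entry<String, Integer> e : freq.entrySet()) {
            heap.add(e);
            if (heap.size() > k) heap.poll(); // drop the current minimum
        }

        List<String> out = new ArrayList<>();
        while (!heap.isEmpty()) out.add(heap.poll().getKey());
        return out;
    }
}
```

Each of the n distinct entries costs O(log k) heap work, giving the stated O(n log k) time with the O(n) frequency map.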
Shell:
tr -c '[:alnum:]' '[\n*]' < test.txt | sort | uniq -c | sort -nr | head -10
If you don't put a sort before the uniq -c, you'll probably get a lot of false singleton words: uniq only collapses adjacent runs of duplicate lines, not duplicates across the whole input.
$ sed -e 's/\s/\n/g' < test.txt | sort | uniq -c | sort -nr | head -10
AWK:
function wordfrequency() {
  awk '
    BEGIN { FS = "[^a-zA-Z]+" }
    {
      for (i = 1; i <= NF; i++) {
        word = tolower($i)
        words[word]++
      }
    }
    END {
      for (w in words)
        printf("%3d %s\n", words[w], w)
    }
  ' | sort -rn
}
$ cat your_file.txt | wordfrequency | head -10
http://www.robbiehaoyan.com/interview/find-the-k-most-frequent-words-in-a-file/
http://algorithmsandme.com/2014/12/find-n-most-frequently-occurring-words-in-a-file/
Read full article from Find the k most frequent words from a file | GeeksforGeeks