Find the k most frequent words from a file | GeeksforGeeks
Given a book of words, and assuming you have enough main memory to accommodate all the words, design a data structure to find the top K most frequently occurring words. The data structure should be dynamic so that new words can be added.
We can use a Trie and a Min Heap to get the k most frequent words efficiently. The idea is to use the Trie for searching existing words and adding new words efficiently. The Trie also stores the count of occurrences of words. A Min Heap of size k is used to keep track of the k most frequent words at any point of time (the use of a Min Heap is the same as when finding the k largest elements).
Trie and Min Heap are linked with each other by storing an additional field in Trie ‘indexMinHeap’ and a pointer ‘trNode’ in Min Heap. The value of ‘indexMinHeap’ is maintained as -1 for the words which are currently not in Min Heap (or currently not among the top k frequent words). For the words which are present in Min Heap, ‘indexMinHeap’ contains the index of the word in Min Heap. The pointer ‘trNode’ in Min Heap points to the leaf node corresponding to the word in Trie.
http://www.zrzahid.com/top-k-or-k-most-frequent-words-in-a-document/
A more efficient O(n) time and O(k) space solution
http://algorithmsandme.com/2014/12/find-n-most-frequently-occurring-words-in-a-file/
Read full article from Find the k most frequent words from a file | GeeksforGeeks
Given a book of words, and assuming you have enough main memory to accommodate all the words, design a data structure to find the top K most frequently occurring words. The data structure should be dynamic so that new words can be added.
We can use a Trie and a Min Heap to get the k most frequent words efficiently. The idea is to use the Trie for searching existing words and adding new words efficiently. The Trie also stores the count of occurrences of words. A Min Heap of size k is used to keep track of the k most frequent words at any point of time (the use of a Min Heap is the same as when finding the k largest elements).
Trie and Min Heap are linked with each other by storing an additional field in Trie ‘indexMinHeap’ and a pointer ‘trNode’ in Min Heap. The value of ‘indexMinHeap’ is maintained as -1 for the words which are currently not in Min Heap (or currently not among the top k frequent words). For the words which are present in Min Heap, ‘indexMinHeap’ contains the index of the word in Min Heap. The pointer ‘trNode’ in Min Heap points to the leaf node corresponding to the word in Trie.
// A Trie node. Leaf nodes (isEnd == true) carry the word's occurrence
// count and a back-reference into the min heap so the two structures
// stay in sync.
struct TrieNode
{
    bool isEnd;                  // marks the end of a complete word
    unsigned frequency;          // number of occurrences of that word
    int indexMinHeap;            // slot in the min heap, or -1 if not among the top k
    TrieNode* child[MAX_CHARS];  // one child slot per letter 'a'..'z'
};
// A min-heap entry: one of the current top-k candidate words.
struct MinHeapNode
{
    TrieNode* root;      // leaf node of this word in the Trie
    unsigned frequency;  // occurrence count (mirrors the Trie leaf)
    char* word;          // heap-owned copy of the word's text
};
// A fixed-capacity min heap holding the k most frequent words seen so far.
struct MinHeap
{
    unsigned capacity;   // maximum number of entries (k)
    int count;           // number of slots currently filled
    MinHeapNode* array;  // backing storage for the heap nodes
};
// Reads whitespace-separated words from `fp`, inserting each into a
// Trie (for counting) and a size-k min heap (for tracking the top k),
// then prints the k most frequent words via displayMinHeap.
void printKMostFreq( FILE* fp, int k )
{
    // Create a Min Heap of size k
    MinHeap* minHeap = createMinHeap( k );

    // Create an empty Trie
    TrieNode* root = NULL;

    // A buffer to store one word at a time
    char buffer[MAX_WORD_SIZE];

    // Build a bounded conversion such as "%99s" so fscanf can never
    // write past the end of buffer (a bare "%s" is an overflow risk).
    char fmt[16];
    snprintf( fmt, sizeof fmt, "%%%ds", MAX_WORD_SIZE - 1 );

    // Read words one by one; == 1 also stops on matching failures,
    // not just end-of-file.
    while ( fscanf( fp, fmt, buffer ) == 1 )
        insertTrieAndHeap( buffer, &root, minHeap );

    // The Min Heap now holds the k most frequent words
    displayMinHeap( minHeap );
}
// Inserts `word` into the Trie rooted at *root (creating nodes as
// needed), updates the word's frequency, and syncs the min heap.
// `dupWord` stays pointed at the start of the word so the heap can
// store a copy of the full text; `word` is consumed one char at a time.
void insertUtil ( TrieNode** root, MinHeap* minHeap,
                  const char* word, const char* dupWord )
{
    // Base case: materialize this Trie position if it is empty
    if ( *root == NULL )
        *root = newTrieNode();

    if ( *word != '\0' )
    {
        // More characters remain: recurse into the child slot for the
        // current letter. Cast to unsigned char before tolower --
        // passing a negative char value is undefined behavior.
        int slot = tolower( (unsigned char)*word ) - 'a';
        insertUtil ( &((*root)->child[ slot ]),
                     minHeap, word + 1, dupWord );
    }
    else
    {
        // The complete word has been processed
        if ( (*root)->isEnd )
        {
            // word already present: just bump the frequency
            ++( (*root)->frequency );
        }
        else
        {
            // first occurrence: mark the leaf and start the count
            (*root)->isEnd = 1;
            (*root)->frequency = 1;
        }

        // Keep the top-k min heap consistent with the new frequency
        insertInMinHeap( minHeap, root, dupWord );
    }
}
// add a word to Trie & min heap. A wrapper over the insertUtil
void
insertTrieAndHeap(
const
char
*word, TrieNode** root, MinHeap* minHeap)
{
// `word` is passed twice: insertUtil consumes one copy character by
// character while the other (dupWord) stays intact for heap storage.
insertUtil( root, minHeap, word, word );
}
// Puts `word` (whose Trie leaf is *root) into the min heap, handling
// the three possible situations:
//   1. already in the heap  -> refresh its frequency and sift down
//   2. heap has a free slot -> append the word and rebuild the heap
//   3. heap is full         -> evict the minimum if this word outranks it
void insertInMinHeap( MinHeap* minHeap, TrieNode** root, const char* word )
{
    // Case 1: the word already lives in the heap; its stored frequency
    // just became stale, so refresh it and restore the heap property.
    if ( (*root)->indexMinHeap != -1 )
    {
        ++( minHeap->array[ (*root)->indexMinHeap ].frequency );

        // percolate down
        minHeapify( minHeap, (*root)->indexMinHeap );
    }
    // Case 2: word not present and the heap still has room.
    else if ( minHeap->count < minHeap->capacity )
    {
        int slot = minHeap->count;
        minHeap->array[ slot ].frequency = (*root)->frequency;
        minHeap->array[ slot ].word = new char[ strlen( word ) + 1 ];
        strcpy( minHeap->array[ slot ].word, word );
        minHeap->array[ slot ].root = *root;

        (*root)->indexMinHeap = minHeap->count;
        ++( minHeap->count );
        buildMinHeap( minHeap );
    }
    // Case 3: word not present and the heap is full. The heap root is
    // the least frequent tracked word; replace it if this word is now
    // more frequent.
    else if ( (*root)->frequency > minHeap->array[0].frequency )
    {
        minHeap->array[ 0 ].root->indexMinHeap = -1;
        minHeap->array[ 0 ].root = *root;
        minHeap->array[ 0 ].root->indexMinHeap = 0;
        minHeap->array[ 0 ].frequency = (*root)->frequency;

        // delete the evicted word's previously allocated memory
        delete [] minHeap->array[ 0 ].word;
        minHeap->array[ 0 ].word = new char[ strlen( word ) + 1 ];
        strcpy( minHeap->array[ 0 ].word, word );

        minHeapify ( minHeap, 0 );
    }
}
http://www.zrzahid.com/top-k-or-k-most-frequent-words-in-a-document/
A more efficient O(n) time and O(k) space solution
public String[] topKWordsSelect(final String stream, final int k) { final Map<String, Integer> frequencyMap = new HashMap<String, Integer>(); final String[] words = stream.toLowerCase().trim().split(" "); for (final String word : words) { int freq = 1; if (frequencyMap.containsKey(word)) { freq = frequencyMap.get(word) + 1; // use MutableInt } // update the frequency map frequencyMap.put(word, freq); } // Find kth largest frequency which is same as (n-k)th smallest frequency final int[] frequencies = new int[frequencyMap.size()]; int i = 0; for (final int value : frequencyMap.values()) { frequencies[i++] = value; } final int kthLargestFreq = kthSmallest(frequencies, 0, i - 1, i - k); // extract the top K final String[] topK = new String[k]; i = 0; for (final java.util.Map.Entry<String, Integer> entry : frequencyMap.entrySet()) { if (entry.getValue().intValue() >= kthLargestFreq) { topK[i++] = entry.getKey(); if (i == k) { break; } } } return topK; }
private static void swap(final int input[], final int i, final int j) { final int temp = input[i]; input[i] = input[j]; input[j] = temp; } private static int partition(final int[] A, final int p, final int r) { final double pivot = A[r]; int i = p - 1; int j = p; for (j = p; j < r; j++) { if (A[j] <= pivot) { swap(A, ++i, j); } } swap(A, i + 1, r); return i + 1; } private static int RandomizedPartition(final int[] A, final int p, final int r) { final int i = (int) Math.round(p + Math.random() * (r - p)); swap(A, i, r); return partition(A, p, r); } public static int kthSmallest(final int[] A, final int p, final int r, final int k) { if (p < r) { final int q = RandomizedPartition(A, p, r); final int n = q - p + 1; if (k == n) { return A[q]; } else if (k < n) { return kthSmallest(A, p, q - 1, k); } else { return kthSmallest(A, q + 1, r, k - n); } } else { return Integer.MIN_VALUE; } }O(nlgk) solution with O(n) space
public String[] topKWords(final String stream, final int k) { final class WordFreq implements Comparable<WordFreq> { String word; int freq; public WordFreq(final String w, final int c) { word = w; freq = c; } @Override public int compareTo(final WordFreq other) { return Integer.compare(this.freq, other.freq); } } final Map<String, Integer> frequencyMap = new HashMap<String, Integer>(); final PriorityQueue<WordFreq> topKHeap = new PriorityQueue<WordFreq>(k); final String[] words = stream.toLowerCase().trim().split(" "); for (final String word : words) { int freq = 1; if (frequencyMap.containsKey(word)) { freq = frequencyMap.get(word) + 1; } // update the frequency map frequencyMap.put(word, freq); } // Build the topK heap for (final java.util.Map.Entry<String, Integer> entry : frequencyMap.entrySet()) { if (topKHeap.size() < k) { topKHeap.add(new WordFreq(entry.getKey(), entry.getValue())); } else if (entry.getValue() > topKHeap.peek().freq) { topKHeap.remove(); topKHeap.add(new WordFreq(entry.getKey(), entry.getValue())); } } // extract the top K final String[] topK = new String[k]; int i = 0; while (topKHeap.size() > 0) { topK[i++] = topKHeap.remove().word; } return topK; }
tr -c '[:alnum:]' '[\n*]' < test.txt | sort | uniq -c | sort -nr | head -10
If you don't put in a
sort
before the uniq -c
you'll probably get a lot of false singleton words. uniq
only collapses adjacent runs of identical lines, not overall uniqueness.
$ sed -e 's/\s/\n/g' < test.txt | sort | uniq -c | sort -nr | head -10
AWK:
# Print each distinct word on stdin with its count, highest count first.
# FS splits fields on any run of non-letters, so punctuation never
# sticks to a word.
function wordfrequency() {
awk '
BEGIN { FS="[^a-zA-Z]+" } {
# tally every field of every line, case-insensitively
for (i=1; i<=NF; i++) {
word = tolower($i)
words[word]++
}
}
END {
# emit "count word" pairs; the trailing sort -rn orders by count, descending
for (w in words)
printf("%3d %s\n", words[w], w)
} ' | sort -rn
}
$ cat your_file.txt | wordfrequency | head -10
http://www.robbiehaoyan.com/interview/find-the-k-most-frequent-words-in-a-file/
http://algorithmsandme.com/2014/12/find-n-most-frequently-occurring-words-in-a-file/
Read full article from Find the k most frequent words from a file | GeeksforGeeks