How to Write a Spelling Corrector
http://www.ashishsharma.me/2012/01/spelling-corrector-algorithm-java-code.html
http://raelcunha.com/spell-correct.php
http://www.ashishsharma.me/2012/01/spelling-corrector-algorithm-java-code.html
// word to count map - how may times a word is present - or a weight attached to a word public static Map<String, Integer> dictionary = new HashMap<String, Integer>(); public static String correct(String word) { if(dictionary.containsKey(word)) return word; // getting all possible edits of input word List<String> edits = edits(word); // Here we can either iterate through list of edits and find the 1st word that is in dictionary and return. // Or we can go to below logic to return the word with most weight (that we would have already stored in dictionary map) // map to hold the possible matches Map<Integer, String> candidates = new HashMap<Integer, String>(); // keep checking the dictionary and populate the possible matches // it stores the count (or weight) of word and the actual word for(String s : edits) { if(dictionary.containsKey(s)) { candidates.put(dictionary.get(s), s); } } // one we have found possible matches - we want to return most popular (most weight) word if(candidates.size() > 0) return candidates.get(Collections.max(candidates.keySet())); // If none matched. // We will pick every word from edits and again do edits (using same previous logic) on that to see if anything matches // We don't do recursion here because we don't the loop to continue infinitely if none matches // If even after doing edits of edits, we don't find the correct word - then return. for(String s : edits) { List<String> newEdits = edits(s); for(String ns : newEdits) { if(dictionary.containsKey(ns)) { candidates.put(dictionary.get(ns), ns); } } } if(candidates.size() > 0) return candidates.get(Collections.max(candidates.keySet())); else return word; } // Word EDITS // Getting all possible corrections c of a given word w. // It is the edit distance between two words: the number of edits it would take to turn one into the other public static List<String> edits(String word) { if(word == null || word.isEmpty()) return null; List<String> list = new ArrayList<String>(); String w = null; // deletes (remove one letter) for (int i = 0; i < word.length(); i++) { w = word.substring(0, i) + word.substring(i + 1); // ith word skipped list.add(w); } // transpose (swap adjacent letters) // note here i is less than word.length() - 1 for (int i = 0; i < word.length() - 1; i++) { // < w.len -1 :: -1 because we swapped last 2 elements in go. w = word.substring(0, i) + word.charAt(i + 1) + word.charAt(i) + word.substring(i + 2); // swapping ith and i+1'th char list.add(w); } // replace (change one letter to another) for (int i = 0; i < word.length(); i++) { for (char c = 'a'; c <= 'z'; c++) { w = word.substring(0, i) + c + word.substring(i + 1); // replacing ith char with all possible alphabets list.add(w); } } // insert (add a letter) // note here i is less than and EQUAL to for (int i = 0; i <= word.length(); i++) { // <= because we want to insert the new char at the end as well for (char c = 'a'; c <= 'z'; c++) { w = word.substring(0, i) + c + word.substring(i); // inserting new char at ith position list.add(w); } } return list; }class Spelling { private final HashMap<String, Integer> nWords = new HashMap<String, Integer>(); public Spelling(String file) throws IOException { BufferedReader in = new BufferedReader(new FileReader(file)); Pattern p = Pattern.compile("\\w+"); for(String temp = ""; temp != null; temp = in.readLine()){ Matcher m = p.matcher(temp.toLowerCase()); while(m.find()) nWords.put((temp = m.group()), nWords.containsKey(temp) ? nWords.get(temp) + 1 : 1); } in.close(); } private final ArrayList<String> edits(String word) { ArrayList<String> result = new ArrayList<String>(); for(int i=0; i < word.length(); ++i) result.add(word.substring(0, i) + word.substring(i+1)); for(int i=0; i < word.length()-1; ++i) result.add(word.substring(0, i) + word.substring(i+1, i+2) + word.substring(i, i+1) + word.substring(i+2)); for(int i=0; i < word.length(); ++i) for(char c='a'; c <= 'z'; ++c) result.add(word.substring(0, i) + String.valueOf(c) + word.substring(i+1)); for(int i=0; i <= word.length(); ++i) for(char c='a'; c <= 'z'; ++c) result.add(word.substring(0, i) + String.valueOf(c) + word.substring(i)); return result; } public final String correct(String word) { if(nWords.containsKey(word)) return word; ArrayList<String> list = edits(word); HashMap<Integer, String> candidates = new HashMap<Integer, String>(); for(String s : list) if(nWords.containsKey(s)) candidates.put(nWords.get(s),s); if(candidates.size() > 0) return candidates.get(Collections.max(candidates.keySet())); for(String s : list) for(String w : edits(s)) if(nWords.containsKey(w)) candidates.put(nWords.get(w),w); return candidates.size() > 0 ? candidates.get(Collections.max(candidates.keySet())) : word; } public static void main(String args[]) throws IOException { if(args.length > 0) System.out.println((new Spelling("big.txt")).correct(args[0])); } }
- P(c), the probability that a proposed correction c stands on its own. This is called the language model: think of it as answering the question "how likely is c to appear in an English text?" So P("the") would have a relatively high probability, while P("zxzxzxzyyy") would be near zero.
- P(w|c), the probability that w would be typed in a text when the author meant c. This is the error model: think of it as answering "how likely is it that the author would type w by mistake when c was intended?"
- argmaxc, the control mechanism, which says to enumerate all feasible values of c, and then choose the one that gives the best combined probability score.
def correct(word): candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word] return max(candidates, key=NWORDS.get)Read full article from How to Write a Spelling Corrector