How to Write a Spelling Corrector
http://www.ashishsharma.me/2012/01/spelling-corrector-algorithm-java-code.html
http://raelcunha.com/spell-correct.php
http://www.ashishsharma.me/2012/01/spelling-corrector-algorithm-java-code.html
// word to count map - how may times a word is present - or a weight attached to a word
public
static
Map<String, Integer> dictionary =
new
HashMap<String, Integer>();
public
static
String correct(String word) {
if
(dictionary.containsKey(word))
return
word;
// getting all possible edits of input word
List<String> edits = edits(word);
// Here we can either iterate through list of edits and find the 1st word that is in dictionary and return.
// Or we can go to below logic to return the word with most weight (that we would have already stored in dictionary map)
// map to hold the possible matches
Map<Integer, String> candidates =
new
HashMap<Integer, String>();
// keep checking the dictionary and populate the possible matches
// it stores the count (or weight) of word and the actual word
for
(String s : edits) {
if
(dictionary.containsKey(s)) {
candidates.put(dictionary.get(s), s);
}
}
// one we have found possible matches - we want to return most popular (most weight) word
if
(candidates.size() >
0
)
return
candidates.get(Collections.max(candidates.keySet()));
// If none matched.
// We will pick every word from edits and again do edits (using same previous logic) on that to see if anything matches
// We don't do recursion here because we don't the loop to continue infinitely if none matches
// If even after doing edits of edits, we don't find the correct word - then return.
for
(String s : edits) {
List<String> newEdits = edits(s);
for
(String ns : newEdits) {
if
(dictionary.containsKey(ns)) {
candidates.put(dictionary.get(ns), ns);
}
}
}
if
(candidates.size() >
0
)
return
candidates.get(Collections.max(candidates.keySet()));
else
return
word;
}
// Word EDITS
// Getting all possible corrections c of a given word w.
// It is the edit distance between two words: the number of edits it would take to turn one into the other
public
static
List<String> edits(String word) {
if
(word ==
null
|| word.isEmpty())
return
null
;
List<String> list =
new
ArrayList<String>();
String w =
null
;
// deletes (remove one letter)
for
(
int
i =
0
; i < word.length(); i++) {
w = word.substring(
0
, i) + word.substring(i +
1
);
// ith word skipped
list.add(w);
}
// transpose (swap adjacent letters)
// note here i is less than word.length() - 1
for
(
int
i =
0
; i < word.length() -
1
; i++) {
// < w.len -1 :: -1 because we swapped last 2 elements in go.
w = word.substring(
0
, i) + word.charAt(i +
1
) + word.charAt(i) + word.substring(i +
2
);
// swapping ith and i+1'th char
list.add(w);
}
// replace (change one letter to another)
for
(
int
i =
0
; i < word.length(); i++) {
for
(
char
c =
'a'
; c <=
'z'
; c++) {
w = word.substring(
0
, i) + c + word.substring(i +
1
);
// replacing ith char with all possible alphabets
list.add(w);
}
}
// insert (add a letter)
// note here i is less than and EQUAL to
for
(
int
i =
0
; i <= word.length(); i++) {
// <= because we want to insert the new char at the end as well
for
(
char
c =
'a'
; c <=
'z'
; c++) {
w = word.substring(
0
, i) + c + word.substring(i);
// inserting new char at ith position
list.add(w);
}
}
return
list;
}
class Spelling { private final HashMap<String, Integer> nWords = new HashMap<String, Integer>(); public Spelling(String file) throws IOException { BufferedReader in = new BufferedReader(new FileReader(file)); Pattern p = Pattern.compile("\\w+"); for(String temp = ""; temp != null; temp = in.readLine()){ Matcher m = p.matcher(temp.toLowerCase()); while(m.find()) nWords.put((temp = m.group()), nWords.containsKey(temp) ? nWords.get(temp) + 1 : 1); } in.close(); } private final ArrayList<String> edits(String word) { ArrayList<String> result = new ArrayList<String>(); for(int i=0; i < word.length(); ++i) result.add(word.substring(0, i) + word.substring(i+1)); for(int i=0; i < word.length()-1; ++i) result.add(word.substring(0, i) + word.substring(i+1, i+2) + word.substring(i, i+1) + word.substring(i+2)); for(int i=0; i < word.length(); ++i) for(char c='a'; c <= 'z'; ++c) result.add(word.substring(0, i) + String.valueOf(c) + word.substring(i+1)); for(int i=0; i <= word.length(); ++i) for(char c='a'; c <= 'z'; ++c) result.add(word.substring(0, i) + String.valueOf(c) + word.substring(i)); return result; } public final String correct(String word) { if(nWords.containsKey(word)) return word; ArrayList<String> list = edits(word); HashMap<Integer, String> candidates = new HashMap<Integer, String>(); for(String s : list) if(nWords.containsKey(s)) candidates.put(nWords.get(s),s); if(candidates.size() > 0) return candidates.get(Collections.max(candidates.keySet())); for(String s : list) for(String w : edits(s)) if(nWords.containsKey(w)) candidates.put(nWords.get(w),w); return candidates.size() > 0 ? candidates.get(Collections.max(candidates.keySet())) : word; } public static void main(String args[]) throws IOException { if(args.length > 0) System.out.println((new Spelling("big.txt")).correct(args[0])); } }
- P(c), the probability that a proposed correction c stands on its own. This is called the language model: think of it as answering the question "how likely is c to appear in an English text?" So P("the") would have a relatively high probability, while P("zxzxzxzyyy") would be near zero.
- P(w|c), the probability that w would be typed in a text when the author meant c. This is the error model: think of it as answering "how likely is it that the author would type w by mistake when c was intended?"
- argmaxc, the control mechanism, which says to enumerate all feasible values of c, and then choose the one that gives the best combined probability score.
def correct(word): candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word] return max(candidates, key=NWORDS.get)Read full article from How to Write a Spelling Corrector