Related: LintCode 623 - K EDIT DISTANCE
Get Similar Words | tech::interview
下面举个例,假设字典为
http://www.mitbbs.com/article_t/JobHunting/32692817.html
int editDist(string word1, string word2){
int len1=word1.length();
int len2=word2.length();
int m[len1+1][len2+1];
//m[0][0]=0;
//init the matrix
for(int i=0; i<=len1; i++){
m[i][0]=i;
}
for(int j=0; j<=len2; j++)
m[0][j]=j;
//then update the matrix
for(int i=1; i<=len1; i++){
for(int j=1; j<=len2; j++){
m[i][j]=min(m[i-1][j-1]+(int)(word1[i-1]!=word2[j-1]), m[i-1][j]+1);
m[i][j]=min(m[i][j], m[i][j-1]+1);
}
}
return m[len1][len2];
}
vector<string> similar(string word, vector<string> &word_list, int k){
vector<string> ret;
int n= word_list.size();
if(n<1) return ret;
for(int i=0; i<n; i++){
if(editDist(word, word_list[i])<=k){
cout<<endl<<"dist: "<< editDist(word, word_list[i]);
ret.push_back(word_list[i]);
}
}
return ret;
}
我看到一篇文章,如果用trie的话,可以避免一些重复的计算,例如如果我们已经计算
了target word和mit之间的距离,当我们再计算target word和mits之间的距离时,就
可以省去一些计算,这是使用trie的意义。
Read full article from Get Similar Words | tech::interview
Get Similar Words | tech::interview
给定一个word list 和一个target word,要求输出在word list 里跟target word的edit distance 相差不大于k的所有words。这是Airbnb的电面题。直接用edit distance挨个遍历一遍也能做,但是如果word list很大,那重复计算会非常多,这时候可以用trie来优化。
下面举个例,假设字典为
["cs", "ct", "cby"]
,target word为"cat"
,k=1
。c/ | \b s t/y
从根节点开始搜索,遍历的单词分别为:
与普通的Edit distance动态规划算法一样,我们对每一个“路过”的单词记录一个DP table。那么所遍历的单词的DP table应该为(假设当前遍历单词,也就是下面代码中的
c -> cb -> (cby) -> cs -> ct
。与普通的Edit distance动态规划算法一样,我们对每一个“路过”的单词记录一个DP table。那么所遍历的单词的DP table应该为(假设当前遍历单词,也就是下面代码中的
path
为”c”):c a t[ 0 1 2 3 ] <- prev_dist-> c [ 1 0 1 2 ] <- cur_distcb [ 2 1 1 2 ]cs [ 2 1 1 2 ]ct [ 2 1 1 1 ]
每一行的最后一位数字即为当前单词与target word的edit distance。显然,如果该值小于等于k,则加入到结果中。最终返回的结果只有一个单词,即
注意,遍历到单词
ct
。注意,遍历到单词
cb
时,edit distance已经为2,且长度已经与cat
相等,也就意味着该节点的子树中所包含的单词与target word的edit distance无论如何都不可能小于等于k,因此直接从该子树返回。所以单词cby
是没有被遍历到的。这也就是Trie做这题的便利之处,当字典较大时会明显提高效率。
你说遍历到cb 就不继续遍历了 因为与 cat的distance已经有2了, 可是假如下一个字母是t呢,cbt 和 cat 不是distance也是1吗
BFS , level traversal
struct Node {Node(char val): value{val}{}~Node() {for(int i = 0; i < 27; ++i) {if (children[i]) {delete children[i];children[i] = nullptr;}}}Node* get(char c, bool auto_create = true) {assert((c >= 'a' && c <= 'z') || c == '\0');int id = c == '\0' ? 26 : c - 'a';if (!children[id] && auto_create) {children[id] = new Node(c);children[id]->parent = this;num_children++;}return children[id];}bool end() {return get('\0',false);}int num_children { 0 };char value{ 0 };Node* parent{ nullptr };Node* children[27] { nullptr };};public:Trie() {_root = new Node(-1);}~Trie() {if (_root) {delete _root;_root = nullptr;}}void add_string(const string& str) {auto node = _root;for (auto c : str) {node = node->get(c);}node->get('\0');}vector<string> get_similar(const string& s, int threshold) {vector<string> ret;function<void(Node*, string&, vector<int>&)> search = [&](Node* node,string& path, vector<int>& prev_dist) {if (node->end() && prev_dist.back() <= threshold)ret.push_back(path);for(int i = 0; i < 26; ++i) {if (!node->children[i]) continue;vector<int> cur_dist = prev_dist;cur_dist[0]++;char letter = 'a' + i;bool go = cur_dist[0] <= threshold;for (int len = 1; len <= s.size(); ++len) {int ins_cost = cur_dist[len - 1] + 1;int del_cost = prev_dist[len] + 1;int rep_cost = 0;if (s[len - 1] != letter)rep_cost = prev_dist[len - 1] + 1;elserep_cost = prev_dist[len - 1];cur_dist[len] = min(min(ins_cost, del_cost), rep_cost);if (cur_dist[len] <= threshold) go = true;}if (go) {path.push_back(letter);search(node->children[i], path, cur_dist);path.pop_back();}}};if(s.empty()) return ret;string p;vector<int> dist(s.size() + 1);iota(dist.begin(), dist.end(), 0);search(_root, p, dist);return ret;}private:Node* _root{ nullptr };
int editDist(string word1, string word2){
int len1=word1.length();
int len2=word2.length();
int m[len1+1][len2+1];
//m[0][0]=0;
//init the matrix
for(int i=0; i<=len1; i++){
m[i][0]=i;
}
for(int j=0; j<=len2; j++)
m[0][j]=j;
//then update the matrix
for(int i=1; i<=len1; i++){
for(int j=1; j<=len2; j++){
m[i][j]=min(m[i-1][j-1]+(int)(word1[i-1]!=word2[j-1]), m[i-1][j]+1);
m[i][j]=min(m[i][j], m[i][j-1]+1);
}
}
return m[len1][len2];
}
vector<string> similar(string word, vector<string> &word_list, int k){
vector<string> ret;
int n= word_list.size();
if(n<1) return ret;
for(int i=0; i<n; i++){
if(editDist(word, word_list[i])<=k){
cout<<endl<<"dist: "<< editDist(word, word_list[i]);
ret.push_back(word_list[i]);
}
}
return ret;
}
我看到一篇文章,如果用trie的话,可以避免一些重复的计算,例如如果我们已经计算
了target word和mit之间的距离,当我们再计算target word和mits之间的距离时,就
可以省去一些计算,这是使用trie的意义。
Read full article from Get Similar Words | tech::interview