Massive Algorithms: Multi Search - Cracking Coding Interview

Multi Search - Cracking Coding Interview

http://www.hawstein.com/posts/20.8.html

Given a string s and an array of smaller strings T, design a method to search s for each small string in T.

给一个字符串S和一个字符串数组T(T中的字符串要比S短许多)，设计一个算法，在字符串S中查找T中的字符串。

后缀Trie的查找效率很优秀，如果你要查找一个长度为n的字符串，只需要O(n)的时间，比较次数就是字符串的长度，相当给力。但是，构造字符串S的后缀Trie却需要O(m^2 )的时间， (m为S的长度)，及O(m^2 )的空间。

http://blog.csdn.net/navyifanr/article/details/24455275
http://skyzdalimit-techbuzz.blogspot.com/2011/12/given-string-s-and-array-of-smaller.html

public static void main(String[] args) {
String testString = "mississippi";
String[] stringList = {"is", "sip", "hi", "sis"};
SuffixTree tree = new SuffixTree(testString);
for (String s : stringList) {
ArrayList<Integer> list = tree.getIndexes(s);
if (list != null) {
System.out.println(s + ":" + list.toString());
}
}
}
}
class SuffixTree {
SuffixTreeNode root = new SuffixTreeNode();
public SuffixTree(String s) {
for (int i = 0; i < s.length(); i++) {
String suffix = s.substring(i);
root.insertString(suffix, i);
}
}
public ArrayList<Integer> getIndexes(String s) {
return root.getIndexes(s);
}
}
class SuffixTreeNode {
HashMap<Character, SuffixTreeNode> children = new
HashMap<Character, SuffixTreeNode>();
char value;
ArrayList<Integer> indexes = new ArrayList<Integer>();
public SuffixTreeNode() { }
public void insertString(String s, int index) {
indexes.add(index);
if (s != null && s.length() > 0) {
value = s.charAt(0);
SuffixTreeNode child = null;
if (children.containsKey(value)) {
child = children.get(value);
} else {
child = new SuffixTreeNode();
children.put(value, child);
}
String remainder = s.substring(1);
child.insertString(remainder, index);
}
}
public ArrayList<Integer> getIndexes(String s) {
if (s == null || s.length() == 0) {
return indexes;
} else {
char first = s.charAt(0);
if (children.containsKey(first)) {
String remainder = s.substring(1);
return children.get(first).getIndexes(remainder);
}
}
return null;
}

https://github.com/careercup/CtCI-6th-Edition/tree/master/Java/Ch%2017.%20Hard/Q17_17_Multi_Search

Alternatively, we can add all the smaller strings into a trie

public static Trie createTreeFromStrings(String[] smalls, int maxSize) {

Trie tree = new Trie();

for (String s : smalls) {

if (s.length() <= maxSize) {

tree.insertString(s, 0);

}

return tree;

}

public static ArrayList<String> findStringsAtLoc(TrieNode root, String big, int start) {

ArrayList<String> strings = new ArrayList<String>();

int index = start;

while (index < big.length()) {

root = root.getChild(big.charAt(index));

if (root == null) break;

if (root.terminates()) {

strings.add(big.substring(start, index + 1));

}

index++;

}

return strings;

}

public static void insertIntoHashMap(ArrayList<String> strings, HashMapList<String, Integer> map, int index) {

for (String s : strings) {

map.put(s, index);

}

public static HashMapList<String, Integer> searchAll(String big, String[] smalls) {

HashMapList<String, Integer> lookup = new HashMapList<String, Integer>();

TrieNode root = createTreeFromStrings(smalls, big.length()).getRoot();

for (int i = 0; i < big.length(); i++) {

ArrayList<String> strings = findStringsAtLoc(root, big, i);

insertIntoHashMap(strings, lookup, i);

}

return lookup;

}

public static void subtractValue(ArrayList<Integer> locations, int delta) {

if (locations == null) return;

for (int i = 0; i < locations.size(); i++) {

locations.set(i, locations.get(i) - delta);

}

public static Trie createTrieFromString(String s) {

Trie trie = new Trie();

for (int i = 0; i < s.length(); i++) {

String suffix = s.substring(i);

trie.insertString(suffix, i);

}

return trie;

}

public static HashMapList<String, Integer> searchAll(String big, String[] smalls) {

HashMapList<String, Integer> lookup = new HashMapList<String, Integer>();

Trie tree = createTrieFromString(big);

for (String s : smalls) {

/* Get terminating location of each occurrence.*/

ArrayList<Integer> locations = tree.search(s);

/* Adjust to starting location. */

subtractValue(locations, s.length());

/* Insert. */

lookup.put(s, locations);

}

return lookup;

}

public class Trie {

private TrieNode root = new TrieNode();

public ArrayList<Integer> search(String s) {

return root.search(s);

}

public void insertString(String str, int location) {

root.insertString(str, location);

}

public TrieNode getRoot() {

return root;

}

public class TrieNode {

private HashMap<Character, TrieNode> children;

private ArrayList<Integer> indexes;

public TrieNode() {

children = new HashMap<Character, TrieNode>();

indexes = new ArrayList<Integer>();

}

public void insertString(String s, int index) {

if (s == null) return;

indexes.add(index);

if (s.length() > 0) {

char value = s.charAt(0);

TrieNode child = null;

if (children.containsKey(value)) {

child = children.get(value);

} else {

child = new TrieNode();

children.put(value, child);

}

String remainder = s.substring(1);

child.insertString(remainder, index + 1);

} else {

children.put('\0', null);

}

public ArrayList<Integer> search(String s) {

if (s == null || s.length() == 0) {

return indexes;

} else {

char first = s.charAt(0);

if (children.containsKey(first)) {

String remainder = s.substring(1);

return children.get(first).search(remainder);

}

return null;

}

public boolean terminates() {

return children.containsKey('\0');

}

public TrieNode getChild(char c) {

return children.get(c);

}

public static boolean isSubstringAtLocation(String big, String small, int offset) {

for (int i = 0; i < small.length(); i++) {

if (big.charAt(offset + i) != small.charAt(i)) {

return false;

}

return true;

}

public static ArrayList<Integer> search(String big, String small) {

ArrayList<Integer> locations = new ArrayList<Integer>();

for (int i = 0; i < big.length() - small.length() + 1; i++) {

if (isSubstringAtLocation(big, small, i)) {

locations.add(i);

}

return locations;

}

public static HashMapList<String, Integer> searchAll(String big, String[] smalls) {

HashMapList<String, Integer> lookup = new HashMapList<String, Integer>();

for (String small : smalls) {

ArrayList<Integer> locations = search(big, small);

lookup.put(small, locations);

}

return lookup;

}

https://github.com/yxjiang/algorithms/blob/master/src/main/java/algorithm/cc150/chapter18/Question8.java

http://www.hawstein.com/posts/20.8.html

AC自动机算法是解决字符串多模式匹配的一个经典方法，时间复杂度为：O(m+kn+z)，其中：m是目标串S的长度，k是模式串个数，n是模式串平均长度，z是S 中出现的模式串数量。从时间复杂度上可以看出，AC自动机比后缀Trie方法要快， m从2次方降到了1次方。

AC自动机也会先构造一棵Trie树，不同的是，它用模式串来构造Trie树。然后遍历一次目标串S，即可求出哪些模式串出现在目标串S中。

关于AC自动机，比较好的资料是： Set Matching and Aho-Corasick Algorithm 它是生物序列算法课的一个课件，这个课的课件基本上都是关于字符串算法的，讲得挺好，推荐一读。
https://www.ideserve.co.in/learn/pattern-matching-using-trie

Multi Search - Cracking Coding Interview

Labels

Popular Posts