Substring with Concatenation of All Words 题解

题意

You are given a string, s, and a list of words, words, that are all of the same length. Find all starting indices of substring(s) in s that is a concatenation of each word in words exactly once and without any intervening characters.

For example, given:
s: "barfoothefoobarman"
words: ["foo", "bar"]

You should return the indices: [0,9].
(order does not matter).

Subscribe to see which companies asked this question

大致来说：给定一个字符串 s 和一个单词数组 words（其中所有单词的长度都相同），要求找出 s 中所有满足条件的子串的起始下标——该子串必须由 words 中的全部单词连续拼接而成，每个单词恰好使用一次，单词之间不能夹杂其他字符，拼接顺序不限；另外，words 中的单词可以重复，重复了几次就必须在子串中出现几次。

思路

其实最简单的思路就是对字符串逐位遍历：在每个位置先找到第一个匹配的单词，然后再到单词数组中逐个查找后面的单词，其时间复杂度约为 O(字符串的长度 × 单词数组中单词的个数)。这种方法虽然比较容易理解，但开销比较大，需要注意的细节也比较多。所以在参考了一些代码之后，我发现了几种更好的方法，包括双 map、使用队列、使用 Trie 树等。

实现

我的实现（最简单容易理解）

/**
 *  Brute-force solution: try every start index and check whether the next
 *  words.size() chunks of word length are exactly the words in `words`
 *  (as a multiset, in any order).
 *
 *  All words are assumed to have the same length as words[0].
 *
 *  @param s     text to search in
 *  @param words words that must each appear exactly once in the window
 *
 *  @return start indices of every matching window, in ascending order;
 *          empty when `words` is empty, the words are empty strings, or
 *          `s` is shorter than the full concatenation
 */
vector<int> findSubstring1(string s, vector<string>& words) {
    vector<int> result;
    // Guard the words[0] access below and the degenerate zero-length case.
    if (words.empty() || words[0].empty()) {
        return result;
    }
    const size_t word_len = words[0].length();
    const size_t total_len = word_len * words.size();
    if (s.length() < total_len) {
        return result;
    }

    // How many times each word must appear in a matching window.
    map<string, int> required;
    for (const string& word : words) {
        ++required[word];
    }

    for (size_t start = 0; start + total_len <= s.length(); ++start) {
        map<string, int> seen;
        size_t matched = 0;
        for (; matched < words.size(); ++matched) {
            const string piece = s.substr(start + matched * word_len, word_len);
            const auto wanted = required.find(piece);
            // Stop at an unknown word or a word used more often than allowed.
            if (wanted == required.end() || ++seen[piece] > wanted->second) {
                break;
            }
        }
        // Every chunk was accepted, so the window uses each word exactly once.
        if (matched == words.size()) {
            result.push_back(static_cast<int>(start));
        }
    }
    return result;
}

双map（最基础的优化）

/**
 *  Sliding-window solution using two hash maps.
 *
 *  `dict` holds how many times each word is required; `tdict` holds how many
 *  times each word occurs in the current window. For every offset in
 *  [0, word length) the string is scanned in word-sized steps while `left`
 *  tracks the start of the current window of consecutive known words.
 *
 *  All words are assumed to have the same length as words[0].
 *
 *  @param s     text to search in
 *  @param words words that must each appear exactly once in the window
 *
 *  @return start indices of every matching window, grouped by scan offset
 *          (ascending inside each offset); empty for empty input
 */
vector<int> findSubstring2(string s, vector<string>& words) {
    vector<int> ans;
    const int n = static_cast<int>(s.size());
    const int cnt = static_cast<int>(words.size());
    if (n <= 0 || cnt <= 0) {
        return ans;
    }
    const int wl = static_cast<int>(words[0].length());
    if (wl <= 0) {
        return ans;
    }

    // Required multiplicity of every word.
    unordered_map<string, int> dict;
    for (const string& word : words) {
        ++dict[word];
    }

    for (int i = 0; i < wl; ++i) {
        // `left` is the start index of the current window, `count` the number
        // of words in it that count towards the required multiset.
        int left = i;
        int count = 0;
        unordered_map<string, int> tdict;
        for (int j = i; j <= n - wl; j += wl) {
            const string str = s.substr(j, wl);
            const auto wanted = dict.find(str);
            if (wanted == dict.end()) {
                // Unknown word: no window can span it, so restart right after
                // it. The window must jump to j + wl; advancing `left` by a
                // single word would leave it behind the scan position.
                count = 0;
                tdict.clear();
                left = j + wl;
                continue;
            }

            int& have = tdict[str];
            ++have;
            if (have <= wanted->second) {
                ++count;
            }
            // Too many copies of `str`: drop words from the left until the
            // surplus copy has left the window.
            while (have > wanted->second) {
                const string dropped = s.substr(left, wl);
                int& dropped_have = tdict[dropped];
                --dropped_have;
                // Only words that were counted reduce `count`; the surplus
                // copy of `str` itself never was.
                if (dropped_have < dict.find(dropped)->second) {
                    --count;
                }
                left += wl;
            }

            if (count == cnt) {
                ans.push_back(left);
                // Slide the window by one word to look for the next match.
                --tdict[s.substr(left, wl)];
                --count;
                left += wl;
            }
        }
    }
    return ans;
}

使用队列

/**
 *  Queue-based solution.
 *
 *  Every distinct word owns a queue with one slot per required occurrence.
 *  Each slot stores the index where that occurrence was last seen, or -1 if
 *  it has not been seen yet. The front of the queue is therefore the oldest
 *  remembered occurrence, which tells whether taking the word again still
 *  fits into the current run of consecutive known words.
 *
 *  All words are assumed to have the same length as words[0].
 */
typedef unordered_map<string, queue<int>> wordItr;

/**
 *  @param s     text to search in
 *  @param words words that must each appear exactly once in the window
 *
 *  @return start indices of every matching window, grouped by scan offset
 *          (ascending inside each offset); empty for empty input
 */
vector<int> findSubstring3(string s, vector<string>& words) {
    vector<int> res;
    if (words.empty() || s.empty()) {
        return res;
    }
    const int wordlen = static_cast<int>(words[0].size());
    const int slen = static_cast<int>(s.size());
    const int wordcnt = static_cast<int>(words.size());
    if (wordlen == 0 || slen < wordlen) {
        return res;
    }

    // One -1 ("not seen yet") slot per required occurrence of each word.
    wordItr initial;
    for (const string& word : words) {
        initial[word].push(-1);
    }

    for (int i = 0; i < wordlen; i++) {
        int curWordCnt = 0;          // words in the current valid run
        wordItr wordHash = initial;  // fresh bookkeeping for this offset
        for (int j = i; j + wordlen <= slen; j += wordlen) {
            const wordItr::iterator it = wordHash.find(s.substr(j, wordlen));
            if (it == wordHash.end()) {
                // Unknown word breaks the run. Stale positions left in the
                // queues are harmless: they lie outside any later run.
                curWordCnt = 0;
                continue;
            }

            queue<int>& positions = it->second;
            const int lastPos = positions.front();
            if (lastPos == -1 || curWordCnt * wordlen < j - lastPos) {
                // Oldest occurrence is unused or outside the run: extend it.
                curWordCnt++;
            }
            else {
                // Oldest occurrence is inside the run: the run must now start
                // right after it.
                curWordCnt = (j - lastPos) / wordlen;
            }
            positions.pop();
            positions.push(j);

            if (curWordCnt == wordcnt) {
                res.push_back(j - wordlen * (wordcnt - 1));
            }
        }
    }
    return res;
}

Trie树

/**
 *  Trie-based approach: instead of hashing the words, they are stored in a
 *  trie over the lowercase letters 'a'..'z' and candidate substrings are
 *  matched against it character by character.
 *
 *  TrieNode is a single trie node.
 */
class TrieNode {
public:
    TrieNode* child[26];  // one slot per letter 'a'..'z'; nullptr when absent
    int cnt;              // number of inserted words that end at this node

    // Value-initialise the child array so every slot starts out as nullptr.
    TrieNode(): child(), cnt(0) {}
};

/**
 *  Owns the nodes of a trie and offers insertion of lowercase words.
 *  Copying is disabled because the trie owns its nodes through raw pointers.
 */
class Trie {
    TrieNode* root;

    // Frees `node` and everything below it.
    static void destroy(TrieNode* node) {
        if (!node) {
            return;
        }
        for (TrieNode* next : node->child) {
            destroy(next);
        }
        delete node;
    }

    // True when every character of `word` has a slot in TrieNode::child.
    static bool isIndexable(const string& word) {
        for (char c : word) {
            if (c < 'a' || c > 'z') {
                return false;
            }
        }
        return true;
    }

public:
    Trie(): root(new TrieNode()) {}

    ~Trie() {
        destroy(root);
    }

    Trie(const Trie&) = delete;
    Trie& operator=(const Trie&) = delete;

    /** @return the root node (never null); the trie keeps ownership */
    TrieNode* getRoot() {
        return root;
    }

    /** Inserts every word of `words`; see addWord for the accepted alphabet. */
    void buildTrie(const vector<string>& words) {
        for (const string& word : words) {
            addWord(word);
        }
    }

    /**
     *  Inserts `word` and increments the counter of its final node.
     *  A word containing a character outside 'a'..'z' cannot be indexed and
     *  is ignored as a whole, so no partial path is created for it.
     */
    void addWord(const string& word) {
        if (!isIndexable(word)) {
            return;
        }
        TrieNode* cur = root;
        for (char c : word) {
            const int slot = c - 'a';
            if (!cur->child[slot]) {
                cur->child[slot] = new TrieNode();
            }
            cur = cur->child[slot];
        }
        cur->cnt++;
    }
};

// Trie consulted by isSubString1; findSubstring4 assigns it before matching.
Trie* trie = nullptr;

/**
 *  Checks recursively whether s[start, end) can be split into words stored
 *  in the global trie, using every stored word at most as often as it was
 *  inserted. A node counter is decremented while its word is in use and
 *  restored afterwards, so the trie is unchanged when the call returns.
 *
 *  Instead of recursing, the same check could be done with two loops: the
 *  outer one stepping through the range word by word, the inner one matching
 *  a single word against the trie.
 *
 *  @param s     text being examined
 *  @param start index of the first character of the range
 *  @param end   index one past the last character of the range
 *
 *  @return true when the whole range is a concatenation of available words;
 *          false otherwise, including when no trie is set, the range does not
 *          lie inside `s`, or a character outside 'a'..'z' is met
 */
bool isSubString1(string& s, int start, int end) {
    if (!trie || start < 0 || end > static_cast<int>(s.size())) {
        return false;
    }
    TrieNode* node = trie->getRoot();
    for (int i = start; i < end; i++) {
        const int idx = s[i] - 'a';
        // Characters without a child slot can never be part of a stored word.
        if (idx < 0 || idx >= 26) {
            return false;
        }
        node = node->child[idx];
        if (!node) {
            return false;
        }
        // A word that is still available ends here.
        if (node->cnt > 0) {
            node->cnt--;  // mark as used while matching the rest
            const bool matched = (i + 1 == end) || isSubString1(s, i + 1, end);
            node->cnt++;  // mark as unused again
            if (matched) {
                return true;
            }
        }
    }
    return false;
}

/**
 *  Trie-based solution: builds a trie from `words` and tests every window of
 *  the full concatenation length with isSubString1.
 *
 *  The trie is stored in the global `trie` and stays there after the call;
 *  the trie left behind by a previous call is released first. Because of the
 *  shared global this function is not reentrant.
 *
 *  All words are assumed to have the same length as words[0].
 *
 *  @param s     text to search in
 *  @param words words that must each appear exactly once in the window
 *
 *  @return start indices of every matching window, in ascending order;
 *          empty when `words` is empty, the words are empty strings, or
 *          `s` is shorter than the full concatenation
 */
vector<int> findSubstring4(string s, vector<string>& words) {
    vector<int> result;
    // Guard the words[0] access and the unsigned subtraction further down.
    if (words.empty() || words[0].empty()) {
        return result;
    }
    const size_t length = words[0].size() * words.size();
    if (s.length() < length) {
        return result;
    }

    delete trie;  // drop the trie of an earlier call instead of leaking it
    trie = new Trie();
    trie->buildTrie(words);

    const int window = static_cast<int>(length);
    // Inclusive bound: the last window ends exactly at the end of `s`.
    const int last_start = static_cast<int>(s.length() - length);
    for (int i = 0; i <= last_start; i++) {
        if (isSubString1(s, i, i + window)) {
            result.push_back(i);
        }
    }
    return result;
}

总结

我觉得无论是什么方法，都逃不掉对字符串的遍历，对单词的匹配，就是看这个过程可以进行多大的优化。