Trie实现(C++)

参考了本文:http://www.cnblogs.com/xulb597/archive/2012/07/05/2578562.html

  • 支持模糊搜索,比如,【bkmh】可以匹配【BuKaManHua】;
  • 支持优先级,首字母、大小字母有更高的优先级。

亟需解决的问题:

  • 目前搜索结果与关键词中字母的顺序无关,即【buk】可以匹配【BKManHua】
  • 若条目中存在重复的任意关键字,即使不包含其他关键字,仍然能匹配上来
  • 内存的占用:因为接触C++不久,内存管理一窍不通,一个1.65MB的文件(312964个单词),索引之后程序(VS2013编译,Release版本)的内存有68 860KB,

trie.hpp

#ifndef TRIE
#define TRIE

#include "LinkedList.hpp"
#include "MatchInfo.hpp"

#include <stdlib.h>
#include <iostream>
#include <string>

#define BRANCH_SIZE 28
#define START_CHAR 'a'

#define INDEX(x) (x == '?'? 27 : (x == '*'? 26 : x - START_CHAR))
        
class Trie
{
public:
    Trie()
    {
        rootNode = new Trie_Node(0);
        memset(nodeMap, NULL, sizeof(nodeMap));
        memset(indexList, NULL, sizeof(indexList));
    }

    ~Trie()
    {
        //delete rootNode;
    }

    void insert(const char *data, const int i)
    {
        bool flag_start = false,flag_capital = false;
        Trie_Node *location = rootNode;
        int pos = 0;
        while (*data)
        {
            char c = *data;
            // check wether it's capital and convert to lowwer case if so.
            if(c > 'A'-1 && c < 'Z'+1)
            {
                flag_capital = true;
                c += 32;
            }else{
                flag_capital = false;
            }
            // map the char value to int which starts from 0
            int index = INDEX(c);
            // skip invalid chars
            if(index < 0)
            {
                data++;
                pos++;
                continue;
            }
            // find next
            if(location->next[index] == NULL)
            {
                location->next[index] = getNode(index);
            }
            location = location->next[index];

            // build MatchInfo and add it to the trie node's indexList
            MatchInfo *info = new MatchInfo();
            info->itemindex = i;
            info->position = pos;    // position of the char in the string
            info->priority = 1;

            // intial or capital char has a higher priority
            if(!flag_start)
            {
                flag_start = true;
                info->priority++;
            }
            if(flag_capital)
                info->priority++;
            if(indexList[index] == NULL)
                indexList[index] = new LinkedList<MatchInfo>();
            indexList[index]->add(info);
            data++;
            pos++;
        }
        // end character has a higher priority
        //location->indexList->getCurrent()->value->priority++;
    }

    /*int match(const char *data)
    {
        Trie_Node *location = rootNode;
        while (*data && location)
        {
            location = location->next[INDEX(*data)];
            data++;
        }
        return (location != NULL);
    }*/

    /*void fuzzy_match(const char *data, int* indexMap, size_t indexMapLength)
    {
        predicateIndexMap(data, indexMap, indexMapLength);
        
        int index;
        Trie_Node *location = rootNode;
        while (*data && location)
        {
            index = INDEX(*data);
            location = location->next[INDEX(*data)];
            if(location != NULL)
            {
                fillIndexArray(indexMap, index);
            }
            data++;
        }
    }*/

    void fuzzy_match(const char *data, int* indexMap, size_t indexMapLength)
    {
        predicateIndexMap(data, indexMap, indexMapLength);
        
        int index;
        Trie_Node *location = nodeMap[INDEX(*data)];
        do
        {
            index = INDEX(*data);
            if(location != NULL)
            {
                fillIndexArray(indexMap, index);
            }
            else
            {
                break;
            }
            data++;
        } while ((*data) && (location = nodeMap[index]));
    }

    /*void print()
    {
        print(rootNode);
    }*/

private:
    //
    // a list to record matche info of each char in indexed words.
    // it's for priority and fuzzy seaching.
    //
    LinkedList<MatchInfo>* indexList[BRANCH_SIZE];

    struct Trie_Node
    {
        //int index;
        Trie_Node *next[BRANCH_SIZE];
        
        Trie_Node(int _index)
        {
            //index = _index;
            memset(next, NULL, sizeof(next));
        };
        ~Trie_Node()
        {
            //delete indexList;
            for (int i = 0; i < BRANCH_SIZE; i++)
            {
                if(next[i])
                    delete next[i];
            }
        }
    };

    Trie_Node *rootNode;

    //
    // a map to hold all created Trie_Node.
    //
    Trie_Node *nodeMap[BRANCH_SIZE];

    //
    // /*get a trie node from map.*/
    // return a new Trie_Node;
    // index: (char - 'a')
    //
    Trie_Node *getNode(int index)
    {
        //return new Trie_Node(index);
        Trie_Node *tempNode = nodeMap[index];
        if(tempNode == NULL)
        {
            tempNode = new Trie_Node(index);
            nodeMap[index] = tempNode;
        }
        return tempNode;
    }

    //
    // fill [indexMap] with priority of char at [index]
    //
    void fillIndexArray(int* indexMap, int index)
    {
        if(indexList[index] == NULL)
            indexList[index] = new LinkedList<MatchInfo>();
        LinkedList<MatchInfo> *list = indexList[index];
        Node<MatchInfo> *node = list->getRoot();
        while (node)
        {
            int itemIndex = node->value->itemindex;
            if(indexMap[itemIndex] != -1)
                indexMap[itemIndex] += node->value->priority;
            node = node->next;
        }
    }

    //
    // keep moving node to next until it's itemindex in value has been changed.
    // node will set to NULL if reaches the end.
    //
    void moveToNextItemIndex(Node<MatchInfo> **node)
    {
        int index = (*node)->value->itemindex;
        if((*node)->next == NULL)
            (*node) = NULL;
        else
        {
            while ((*node)->value->itemindex == index)
            {
                (*node)=(*node)->next;
                if((*node) == NULL)
                    break;
            }
        }
    }

    //
    // predicate whether an index in indexMap is impossiable to be matched.
    // It will be set to -1 if so.
    //
    void predicateIndexMap(const char* keyword, int* indexMap, size_t indexMapLength)
    {
        int *indexesMatched = new int[indexMapLength];
        int keywordLength = strlen(keyword);
        unsigned int keywordRecords[BRANCH_SIZE];
        size_t size = indexMapLength * sizeof(int);
        memset(indexesMatched, 0, size);
        memset(indexMap, -1, size);
        LinkedList<MatchInfo> *list;
        Node<MatchInfo> *match_node;
        int charIndex, index = 0;
        while (*keyword)
        {
            charIndex = INDEX(*keyword);
            if(keywordRecords[charIndex] == 1)
            {
                keyword++;
                continue;
            }
            keywordRecords[charIndex] = 1;
            list = indexList[charIndex];
            if(list != NULL)
            {
                match_node = list->getRoot();
                while (match_node != NULL)
                {
                    indexesMatched[match_node->value->itemindex]++;
                    match_node = match_node->next;
                    //moveToNextItemIndex(&match_node);
                }
            }
            keyword++;
        }
        for (int i = 0; i < indexMapLength; i++)
        {
            if(indexesMatched[i] >= keywordLength)
                indexMap[i] = 0;
        }
        delete indexesMatched;
    }

    /*void print(Trie_Node* node)
    {
        char c;
        for (int i = 0; i < BRANCH_SIZE; i++)
        {
            if(node->next[i] != NULL)
{
                c = node->index + 'a';
                printf("%c-", c);
                print(node->next[i]);
            }
        }
    }*/
};
#endif // TRIE
View Code

LinkedList.hpp

#ifndef LINKEDLIST
#define LINKEDLIST

#include <stdlib.h>
#include <iostream>

template <class T>
struct Node
{
    T* value;
    int index;
    Node *next;
    Node(T* _value, int _index)
    {
        value = _value;
        index = _index;
    }
    ~Node()
    {
        delete value;
    }
};

template <class T>
class LinkedList
{
public:
    int length;
    LinkedList()
    {
        length = 0;
        root = new Node<T>(NULL, 0);
        current = root;
    };
    ~LinkedList()
    {
        Node<MatchInfo> *node = root;
        Node<MatchInfo> *tmp;
        while (node)
        {
            tmp = node->next;
            delete node;
            node = tmp;
        }
    };
    void add(T *value)
    {
        if(length == 0)
        {
            root->value = value;
            root->index = 0;
        }
        else
        {
            current->next = new Node<T>(value, current->index + 1);
            current = current->next;
        }
        length++;
        current->next = NULL;
    };
    Node<T> getAt(int index)
    {
        Node<T> *node = root;
        while (node)
        {
            if(node->index == index)
                return node;
            node = node->next;
        }
        return NULL;
    }
    Node<T> *getRoot()
    {
        return root;
    }

    Node<T> *getCurrent()
    {
        return current;
    }
private:
    Node<T> *root,*current;
};


#endif // LINKEDLIST
View Code

MatchInfo.hpp

#ifndef DEFINE_MatchInfo
#define DEFINE_MatchInfo
//
// 字符的匹配信息
//
struct MatchInfo
{
    // 所在条目的序号
    int itemindex;
    // 所在的位置
    int position;
    // 优先级
    int priority;
};
#endif
View Code

SortBiTree.hpp

#ifndef DEFINE_SortBiTree
#define DEFINE_SortBiTree

#include <stdlib.h>
#include <iostream>
#include <string>

template <class T>
struct BTNode
{
    int index;
    T value;
    BTNode<T> *left,*right;
    BTNode(int _i, T _v)
    {
        index = _i;
        value = _v;
        left = NULL;
        right = NULL;
    }
};

template <class T>
class SortBiTree
{
public:
    SortBiTree()
    {
        root = NULL;
    }
    ~SortBiTree()
    {
    }

    void add(int index, T value)
    {
        BTNode<T> *node = root;
        if(root == NULL)
            root = new BTNode<T>(index, value);
        else
        {
            add_iter(root, index, value);
        }
    }

    BTNode<T> *getMaxNode()
    {
        return maxNode;
    }

    BTNode<T> *getRootNode()
    {
        return root;
    }
private:
    BTNode<T> *root,*minNode,*maxNode;
    void add_iter(BTNode<T>* node, int index, int value)
    {
        if(index > node->index)
        {
            if(node->left != NULL)
            {
                add_iter(node->left, index, value);
            }
            else
            {
                node->left = new BTNode<T>(index, value);
                maxNode = node->left;
            }
        }
        else
        {
            if(node->right != NULL)
            {
                add_iter(node->right, index, value);
            }
            else
            {
                node->right = new BTNode<T>(index, value);
                minNode = node->right;
            }
        }
    }
};
#endif
View Code

Stopwatch.hpp

#ifndef STOPWATCH_DEF
#define STOPWATCH_DEF
#include <ctime>

class Stopwatch
{
public:
    Stopwatch(){};
    ~Stopwatch(){};
    static Stopwatch* StartNew()
    {
        Stopwatch* sw = new Stopwatch();
        sw->startTime = clock();
        return sw;
    }

    double Stop()
    {
        double t = (double)(clock() - startTime)/CLOCKS_PER_SEC;
        delete this;
        return t;
    }
private:
    clock_t startTime;
};
#endif
View Code

main.cpp

#include <stdlib.h>
#include <iostream>
#include <string>
#include <fstream>
#include <sys/stat.h>

#include "Stopwatch.hpp"
#include "trie.hpp"
#include "SortBiTree.hpp"

using namespace std;

int displayMaxItems = 10;

unsigned long get_file_size(const char *path)
{
    unsigned long filesize = -1;    
    struct stat statbuff;
    if(stat(path, &statbuff) < 0){
        return filesize;
    }else{
        filesize = statbuff.st_size;
    }
    return filesize;
}

void readFromFile(const char* path, char** buff)
{
    long length = get_file_size(path);    // 取得文件大小
    if(length == -1){
        cerr << "content file is invalid!" << endl;
        system("quit");
        return;
    }
    FILE *f = fopen(path, "r");
    *buff = new char[length];
    fread(*buff, sizeof(char), length, f);
    fclose(f);
}

void readInlineChars(char* source, char** buff)
{
    *buff = new char[strlen(source)];
    strcpy(*buff, source);
}

//
// 用于显示结果
// TODO:使用迭代器
//
void inorder_traverse(BTNode<int> *node, char* words[], int* indexMap, int threshold) {
    if(displayMaxItems == 0)
        return;
    if (NULL != node->left) {
        inorder_traverse(node->left, words, indexMap, threshold);
    }
    int confidence = indexMap[node->value];
    if(confidence > threshold)
    {
        printf("%i	%i	%s
", node->value, confidence, words[node->value]);
        displayMaxItems--;
    }
    if (NULL != node->right) {
        inorder_traverse(node->right, words, indexMap, threshold);
    }
}

void main(char* argv){
    Trie t;
    SortBiTree<int> bt;
    char* cpy = new char[];
    //=====================================
    // 注意,大文件请自行修改,去掉words
    //=====================================
    char *words[256];

    printf("indexing...");
    //readFromFile("contents.txt", &cpy);
    readFromFile("app_list.txt", &cpy);
    //readInlineChars("PlayShangDian PhotoshopDesigner pho Pho BuKaManHua BKManHua", &cpy);
    //readInlineChars("abc acc caa acb aaa abb", &cpy);
    //readInlineChars("aac aaa", &cpy);

    char *tk = strtok(cpy, " ");
    
    size_t index = 0;
    Stopwatch *sw = Stopwatch::StartNew();
    t.insert(tk, index++);
    while (tk = strtok(NULL, " "))
    {
        words[index] = tk;
        t.insert(tk, index++);
    }

    printf("%i word(s) have been indexed. [%lf seconds]
", index, sw->Stop());
label_enter:
    printf("I'm searching for:
>");

    string input;
    const char *chars;
    while (true)
    {
        cin >> input;
        chars = input.data();
        if(!isalpha(*chars) || (*chars <= 'Z' && *chars >= 'A'))
        {
            printf("only lowwer character is accepted!
>");
        }
        else
            break;
    }
    
    int *indexMap = new int[index];
    sw = Stopwatch::StartNew();
    printf("searching...");
    t.fuzzy_match(chars, indexMap, index);
    printf("done. [%lf seconds]
", sw->Stop());
label_change:
    bt = SortBiTree<int>();
    printf("please input the threshold:
>");
    int threshold = 0;
    cin >> threshold;

    printf("calculating...");
    int count = 0;
    for (int i = 0; i < index; i++)
    {
        int confidence = indexMap[i];
        if(confidence > threshold)
        {
            count++;
            bt.add(confidence, i);
        }
    }
    printf(" [%i] words matched.
", count);
    displayMaxItems = 10;
    if(count > displayMaxItems)
        printf("first 10 items are listed below.
");
    goto label_display;

label_display:
    printf("------------------------------------------
");
    printf("index	priority	content
");
    printf("------------------------------------------
");
    if(bt.getRootNode() != NULL)
        inorder_traverse(bt.getRootNode(), words, indexMap, threshold);
    else
        printf("                none                      
");
    /*it = &bt.getIterator();
    while (it->hasNext())
    {
        int i = it->next();
        int confidence = indexMap[i];
        if(confidence > threshold)
        {
            printf("%s	%i	%i
",words[i], confidence, i);
        }
    }*/
    printf("------------------------------------------
");
label_menu:
    printf("now you may want to : 
[1].See them all.
[2].Change thresgold
[3].Change keyword.
[q].Exit
>");
    char choise = 0;
    cin >> choise;
    switch (choise)
    {
    case '1':
        displayMaxItems = -1;
        goto label_display;
        break;
    case '2':
        goto label_change;
        break;
    case '3':
        goto label_enter;
        break;
    case 'q':
        break;
    default:
        break;
    }

    //delete indexMap;
}
View Code

截图:

原文地址:https://www.cnblogs.com/ornithopter/p/3732496.html