ternary tree / minimal perfect hash

Допустим у нас есть некая таблица(а.к.а. словарь) состоящая из N ключевых слов[1].
Нам нужно написать функцию типа:

  int search(const char* some_word);

которая быстро возвращает либо число из диапазона [1..N] если some_word в таблице или 0 если такого слова там нет.

Задача построения такой функции решаема несколькими вариантами, один из них например
построить т.н. minimal perfect hash function с помощью например gperf (но это возможно не на каждом наборе).

Второй вариант — воспользоваться предлагаемой имплементацией ternary tree [2]:

Класс ternary_tree кроме двух очевидных и полезных методов:

dword ternary_tree::insert(const char* s);
dword ternary_tree::search(const char* s);

имеет также бонус методы:

size_t ternary_tree::partial_match_search(const char *s, array<dword>& result_set );
size_t ternary_tree::neighbour_search(const char *s, int d, array<dword>& result_set);

partial_match_search позволяет находить множество слов из словаря удовлетворяющим шаблону.
например их словаря:

"one" "twa" "ane" "ono" "twas"

partial_match_search("?n?") найдет "one" "ane" "ono".

neighbour_search позволяет находить "похожие" словаю. Мера похожести задается параметром d — HAMMING DISTANCE [3].

например neighbour_search("twa",2) найдет два слова "twa" и "twas"

Замечания по имплементации:
1) Я использую свой array который может быть легко заменен на std::vector
2) Выбранный способ аллокации nodes (элементы в масиве) помимо всего прочего позволяет
задавать compiled static node tables — т.е. если у вас есть статическая таблица keywords то в принципе её можно скомпилировать и разместить в коде как static. Т.е. для проведения search не надо будет делать inserts при инициализации.

Алгоритм известен своей практически идеальной производительностью.
Во всяком случае теоретически лучше чем hash_table's

[1]слово — здесь есть конечная последовательность символов заканчивающаяся нулем)
[2]Теория ternary tree: http://home.od.ua/~relayer/algo/data/tern/lead.htm
[3]HAMMING DISTANCE: http://pespmc1.vub.ac.be/ASC/HAMMIN_DISTA.html

-------------------------------------------
source code:

namespace tool 
{
  template <typename CHAR>
    class ternary_tree
  {
    typedef unsigned int node_index;

  public:
    enum constants 
    {
        any_one_char = '?'
    };

    ternary_tree(): max_no(0) {}

    // 
    // returns number [1..max uint] of inserted or existing item.
    // does no insertion if item already exists. 
    //
    dword insert(const CHAR *s)
    {
      return (dword)nodes[insert(0, s)].eqkid;
    }

    // returns n[1..max_no] or zero if not found
    dword search(const CHAR *s) const
    {   
      node_index nid = 0; // root index;
      while (nid < total_nodes()) 
      {
         const node& n = nodes[nid];
         if (*s < n.splitchar)
             nid = n.lokid;
         else if(*s > n.splitchar)
             nid = n.hikid;
         else //(*s == p->splitchar) 
         {
            if (*s++ == 0)
                return n.value();
            nid = n.eqkid;
         } 
      }
      return 0;
    }
    // partial match search
    // e.g. 
    size_t partial_match_search(const CHAR *s, array<dword>& result_set ) const
    {
      partial_match_search(0,s,result_set);
      return result_set.size();
    }

    //
    // neighbour search 
    //
    // 'd' is Hamming distance of a query word: 
    //      A measure of the difference between two strings, 
    //      expressed by the number of characters that need to be changed to 
    //      obtain one from the other. 
    //      E.g., 0101 and 0110 has a Hamming distance of two 
    //      whereas "Butter" and "ladder" are four characters apart.
    //
    size_t neighbour_search(const CHAR *s, int d, array<dword>& result_set) const
    {
      neighbour_search(0, s, result_set, d);
      return result_set.size();
    }

 protected:
    typedef unsigned int node_index;
    
    struct node 
    {
       CHAR       splitchar;
       node_index lokid, eqkid, hikid;
       node():splitchar(0), lokid(-1), eqkid(-1), hikid(-1){}
       dword value() const 
       {
         assert(splitchar == 0);
         return (dword) eqkid;
       }
       void value(dword v)  
       {
         assert(splitchar == 0);
         eqkid = (node_index)v;
       }
    };

    array<node> nodes;
    dword       max_no;

    size_t      total_nodes() const { return (dword) nodes.size(); }
    
    node_index  insert(node_index nid, const CHAR *s)
    { 
       if (nid >= total_nodes()) 
       {
            node n; n.splitchar = *s;
            nodes.push(n);
            nid = nodes.last_index(); 
       }
       const node& n = nodes[nid];
       if (*s < n.splitchar)
          nodes[nid].lokid = insert(n.lokid, s);
       else if(*s > n.splitchar)
          nodes[nid].hikid = insert(n.hikid, s);
       else //*s == p->splitchar 
       {
          if (*s == 0)
          //"...we'll exploit the fact that a node with a 
          // null splitchar cannot have an eqkid, 
          // and store the data in that field."
              nodes[nid].value(++max_no);
          else
              nodes[nid].eqkid = insert(n.eqkid, ++s);
       }
       return nid;
    }

    //partial match search
    void partial_match_search(node_index nid, const CHAR *s, array<dword>& result_set ) const 
    {    
         if (nid >= total_nodes()) 
           return;
         const node& n = nodes[nid];
         if (*s == any_one_char || *s < n.splitchar)
            partial_match_search(n.lokid, s, result_set);
         if (*s == any_one_char || *s == n.splitchar)
            if (n.splitchar && *s)
                partial_match_search(n.eqkid, s+1, result_set);
         if (*s == 0 && n.splitchar == 0)
                result_set.push( (dword) n.value());
         if (*s == any_one_char || *s > n.splitchar)
                partial_match_search(n.hikid, s, result_set);
    }

    static int  length(const CHAR* s)
    {
      const CHAR* ps = s; while(*s) ++s; return s - ps;
    }

    void neighbour_search(node_index nid, const CHAR *s, array<dword>& result_set, int d) const
    {   
        if (nid >= total_nodes() || d < 0) return;

        const node& n = nodes[nid];      

        if (d > 0 || *s < n.splitchar)
            neighbour_search(n.lokid, s, result_set, d);
        if (n.splitchar == 0) 
        {
           if ( length(s) <= d)
              result_set.push(n.value());
        } 
        else
           neighbour_search(n.eqkid, *s ? s+1:s, result_set, (*s == n.splitchar) ? d:d-1 );
        if (d > 0 || *s > n.splitchar)
           neighbour_search(n.hikid, s, result_set, d);
    }


  };

}

От:	c-smile	http://terrainformatica.com
Дата:	24.09.04 20:25
Оценка:	156 (19)