本文共 7731 字,大约阅读时间需要 25 分钟。
水煮coreutils 之二 hash
在查找效率要求高的系统中,hash是一个永远不变的话题,查找时间为O(1),把大家的眼光深深的锁定在上面再也走不开.天下没有
白吃的午餐,hash表也是视内存如粪土的东东,浪费内存起来也是大把大把的.空间换时间的思想在这里体现的淋漓尽致.
第一次正式的接触这类东东还是在看偶们公司的代码的时候,以前做这类事情都是用Map,但是Map的效率貌似O(log2 n),而且在耗
内存方面也毫不逊色.所以觉得这个东东还是有点先进的.
等到我看到了hash在coreutils的实现,以前的认识觉得是多么肤浅,真是五岳归来不看山了.
先来个hash函数开胃:
#include < limits.h > #define SIZE_BITS (sizeof (size_t) * CHAR_BIT) /* A hash function for NUL-terminated char* strings using the method described by Bruno Haible. See http://www.haible.de/bruno/hashfunc.html. */ size_thash_pjw ( const void * x, size_t tablesize){ const char * s; size_t h = 0 ; for (s = x; * s; s ++ ) h = * s + ((h << 9 ) | (h >> (SIZE_BITS - 9 ))); return h % tablesize;} 性能比较可以查看上面的网站,网上也有一个很好的hash函数效率比较的例子
关键是实现:
hash 函数虽然是核心,但是核心总是寥寥几行代码,根据不同的应用情况设计hash函数,有时候比这种传统的hash函数更加实用.
我们主要关注的是hash的实现.
我们来看一下关键部分,对于这类代码insert 和 delete操作才是精华部分,其它代码就没有解释的必要了.
/* If ENTRY matches an entry already in the hash table, return the pointer to the entry from the table. Otherwise, insert ENTRY and return ENTRY. Return NULL if the storage required for insertion cannot be allocated. */ void * hash_insert (Hash_table * table, const void * entry){ void * data; struct hash_entry * bucket; /* The caller cannot insert a NULL entry. */ if ( ! entry) abort (); /* If there's a matching entry already in the table, return that. */ /* 如果对应的key hash到的值有重复 bucket 会有值的 仍然return NULL 参考hash_find_entry的注释: This private function is used to help with insertion and deletion. When ENTRY matches an entry in the table, return a pointer to the corresponding user data and set *BUCKET_HEAD to the head of the selected bucket. Otherwise, return NULL. When DELETE is true and ENTRY matches an entry in the table, unlink the matching entry. */ if ((data = hash_find_entry (table, entry, & bucket, false )) != NULL) return data; /* ENTRY is not matched, it should be inserted. */ if (bucket -> data) { struct hash_entry * new_entry = allocate_entry (table); if (new_entry == NULL) return NULL; /* Add ENTRY in the overflow of the bucket. */ // 链表上加一个节点 new_entry -> data = ( void * ) entry; new_entry -> next = bucket -> next; bucket -> next = new_entry; table -> n_entries ++ ; return ( void * ) entry; } /* Add ENTRY right in the bucket head. */ // 直接在bucket上加入数据 bucket -> data = ( void * ) entry; table -> n_entries ++ ; table -> n_buckets_used ++ ; /* If the growth threshold of the buckets in use has been reached, increase the table size and rehash. There's no point in checking the number of entries: if the hashing function is ill-conditioned, rehashing is not likely to improve it. */ if (table -> n_buckets_used > table -> tuning -> growth_threshold * table -> n_buckets) { /* Check more fully, before starting real work. If tuning arguments became invalid, the second check will rely on proper defaults. 
*/ check_tuning (table); if (table -> n_buckets_used > table -> tuning -> growth_threshold * table -> n_buckets) { const Hash_tuning * tuning = table -> tuning; // 计算数组大小 float candidate = (tuning -> is_n_buckets ? (table -> n_buckets * tuning -> growth_factor) : (table -> n_buckets * tuning -> growth_factor * tuning -> growth_threshold)); // # define SIZE_MAX ((size_t) -1) 如果candidate 表达的数字大于size_t就退出 if (SIZE_MAX <= candidate) return NULL; /* If the rehash fails, arrange to return NULL. */ // 重新建立hash队列 if ( ! hash_rehash (table, candidate)) entry = NULL; } } return ( void * ) entry;} 代码本身的注释就很丰富,不用太多说明.
/* If ENTRY is already in the table, remove it and return the just-deleted data (the user may want to deallocate its storage). If ENTRY is not in the table, don't modify the table and return NULL. */ void * hash_delete (Hash_table * table, const void * entry){ void * data; struct hash_entry * bucket; // 删除操作在这里就完成了 下面的是进行收缩hash_table的操作 data = hash_find_entry (table, entry, & bucket, true ); if ( ! data) return NULL; table -> n_entries -- ; if ( ! bucket -> data) { table -> n_buckets_used -- ; /* If the shrink threshold of the buckets in use has been reached, rehash into a smaller table. */ if (table -> n_buckets_used < table -> tuning -> shrink_threshold * table -> n_buckets) { /* Check more fully, before starting real work. If tuning arguments became invalid, the second check will rely on proper defaults. */ check_tuning (table); if (table -> n_buckets_used < table -> tuning -> shrink_threshold * table -> n_buckets) { const Hash_tuning * tuning = table -> tuning; size_t candidate = (tuning -> is_n_buckets ? table -> n_buckets * tuning -> shrink_factor : (table -> n_buckets * tuning -> shrink_factor * tuning -> growth_threshold)); hash_rehash (table, candidate); } } } return data;} 我们注意到两个函数都调用了hash_find_entry
static void * hash_find_entry (Hash_table * table, const void * entry, struct hash_entry ** bucket_head, bool delete){ // 计算出entry 在hash_table 中的位置 struct hash_entry * bucket = table -> bucket + table -> hasher (entry, table -> n_buckets); struct hash_entry * cursor; if ( ! (bucket < table -> bucket_limit)) abort (); * bucket_head = bucket; /* Test for empty bucket. */ if (bucket -> data == NULL) return NULL; /* See if the entry is the first in the bucket. */ if (( * table -> comparator) (entry, bucket -> data)) { void * data = bucket -> data; if (delete) { if (bucket -> next) { struct hash_entry * next = bucket -> next; /* Bump the first overflow entry into the bucket head, then save the previous first overflow entry for later recycling. */ * bucket = * next; // free_entry 只是把值插入到table->free_entry_list 这里有效利用了内存 free_entry (table, next); } else { bucket -> data = NULL; } } return data; } /* Scan the bucket overflow. */ for (cursor = bucket; cursor -> next; cursor = cursor -> next) { if (( * table -> comparator) (entry, cursor -> next -> data)) { void * data = cursor -> next -> data; if (delete) { struct hash_entry * next = cursor -> next; /* Unlink the entry to delete, then save the freed entry for later recycling. */ cursor -> next = next -> next; free_entry (table, next); } return data; } } /* No entry found. */ return NULL;} 对于效率要求更高的的程序来说,收缩队列和扩充队列其实都可以不需要,不过这个就要自己去打造了,看这个代码中流露出一种
思想,函数调用的消耗远小于系统调用的消耗,内存分配,文件打开,读写都是相当耗资源的,反过来边界检查,指针变换,内存拷贝的消耗可以
忽略不计.
附录昨天我写的一个hash函数应用的例子:
#define USE_DIFF_HASH 1 #include " ../lib/hash.h " #include " ../lib/hash-pjw.h " #include < stdio.h > #include < stdlib.h > #include < string .h > struct tl{ char s1[ 256 ]; char s2[ 256 ];}; static bool hash_str_cmp( void * s1, void * s2){ return strcmp((( struct tl * )s1) -> s1,(( struct tl * )s2) -> s1) == 0 ;} static size_t hash_fun( const void * s,size_t t){ return hash_pjw((( struct tl * )s) -> s1,t);} struct tl * make_tl( char * s){ struct tl * tmp = ( struct tl * )malloc( sizeof ( struct tl)) ; memset(tmp, 0 , sizeof ( struct tl)) ; char * p = s ; int n = 0 ; while ( * p != ' ' ){ p ++ ;} memcpy(tmp -> s1,s,p - s); while ( * p == ' ' ) { p ++ ; } strcpy(tmp -> s2,p); return tmp ;} int main( int argc , char ** argv){ Hash_table * myhash = hash_initialize ( 25026 ,NULL,hash_fun,hash_str_cmp,free); char str[ 256 ]; struct tl * ptl = NULL ;FILE * fp = fopen( " tl.log " , " r " ); while (NULL != fgets(str, 256 ,fp)){ ptl = make_tl(str); hash_insert(myhash,ptl); // printf("%s 44444 %s ",ptl->s1,ptl->s2); }fclose(fp);printf( " ----------------------------------------------------- " ); struct tl dd ; struct tl * s;strcpy(dd.s1, " 888046 " );s = hash_lookup (myhash, & dd);printf( " %s the value %s " ,dd.s1,s -> s2);hash_print_statistics(myhash,stdout); return 0 ;} 提取的文件
root # [ / root / coreutils- 5.2 . 1 / tltest] more tl . log 888022 881050971 888022 881123701 888026 881122057 888026 881044216 888028 881052006 888028 881160639 888029 881120640 888029 881034717 888031 881161788 888031 881044952 888033 881120535 888033 881024131 转载地址:http://sskmi.baihongyu.com/