a tutorial on computer science

C++博客 :: 首頁 :: 新隨筆 :: 聯系 :: 聚合

:: 管理 ::

21 隨筆 :: 0 文章 :: 17 評論 :: 0 Trackbacks

<

2025年11月

>

日

一

二

三

四

五

六

26

27

28

29

30

31

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

1

2

3

4

5

6

常用鏈接

留言簿(1)

隨筆檔案

friends

遠行

搜索

閱讀排行榜

評論排行榜

hash初步

   hash算法一直被我認為成一種處理大數據量的高效算法(時間復雜度)。
   從一道百度面試題開始。
    搜索引擎會通過日志文件把用戶每次檢索使用的所有檢索串都記錄下來，每個查詢串的長度為1-255字節。
    假設目前有一千萬個記錄（這些查詢串的重復度比較高，雖然總數是1千萬，但如果除去重復后，不超過3百萬個。一個查詢串的重復度越高，說明查詢它的用戶越多，也就是越熱門。），請你統計最熱門的10個查詢串，要求使用的內存不能超過1G。

   好。首先想暴力解決下，看看內存夠不夠。大約255X10^8B內存，2.4G的樣子。。超內存了。。汗。。300萬，那就是0.8G，剛剛好。很自然的，我們可以想到，如果每次向內存讀一個字符串，然后把那個字符串出現的次數和字符串存起來，這樣，就可以在不超過1G的情況下搞定。編程珠璣上面有這道題目的int版本，大概就是問10億個整數（從1到10億，缺一個），不超過多少內存，要求最快找出缺少的數。那個題目也是利用hash的思想，不過它的hash函數就是它自己就是了。開一個10億比特的內存，然后把flag[num]設置一下，最后再統計一下。好吧，這個題目是不是可以利用類似的思想呢？好吧，hash吧。
   hash算法的基本步驟是：把數據存放到key(data[i])里面。如此簡單。就是建立data[i]和i的映射關系，然后利用數組可以隨機訪問的特點，使得在O(1)的時間復雜度再次找到數據(理想情況，可能沖突)！hash最直接的利用就是lookup table,查找表。建立一個hash表，然后可以進行快速查找。（如果出現訪問沖突怎么辦呢？大致分為兩種辦法：開散列和閉散列。開散列就是找到了這個位置被別人占了，好，找個規則換地方。閉散列就是這個地方被別人站著，我跟在他后面(鏈表)。高深的玩意研究不懂，MARK之，以后慢慢看）。
   hash解決此題：網上找一個字符串hash函數看看先(看不懂，直接用。哪位大神可以告訴我為什么或者詳細資料??)。建立一個空的hash表，每次讀一個字符串。找到這個字符串的key（就是用hash函數對它XXX）,返回一個位置?？纯茨莻€位置是不是被別人占了。如果被別人占了，我就往后走，直到找到一個空位子。坐下。當然這個過程也許會找到和自己一樣的，那樣就把它的訪問次數+1。好了，hash表建好了，里面有300萬個字符串，每一個字符串的搜索次數也統計出來了。
   問題完成了第一步。
第二部是，統計TOP K字符串。這個。?？梢耘艂€序，qsort,O(N*logN),太挫了。果斷用個小頂堆，把復雜度降到O(N*log(K))，K 很小，這個很劃算啊。
關于堆的問題就不詳細闡述了，實現簡單(siftdown(int),siftup(int))，目的明確(取最值，增加刪除元素)。下面是測試的代碼。當然我沒有那么大的數據量，寫的代碼也僅供測試之用。

#include <stdio.h>

#include <string.h>

#define MAXN 47

#define NUM 10

typedef struct

{

char str[256];

int time;

}node;

node data[MAXN];

node heap[NUM];//小頂堆

int hcount = 0;

void swap(node& a,node& b)

{

node tmp;

tmp = a;

a = b;

b = tmp;

}

void siftdown(int i)

{

int minst = i;

if(2*i<=hcount&&heap[i].time>heap[2*i].time)

minst = 2*i;

if(2*i+1<=hcount&&heap[minst].time>heap[2*i+1].time)

minst = 2*i+1;

swap(heap[i],heap[minst]);

if(i!=minst)

{

siftdown(minst);

}

void siftup(int i)

{

while(heap[i].time<heap[i/2].time)

{

swap(heap[i],heap[i/2]);

siftup(i);

}

void pop()

{

if(hcount<=0)

return;

swap(heap[1],heap[hcount]);

hcount--;

siftdown(1);

}

void add(node n)

{

if(hcount<NUM)

{

data[hcount++] = n;

siftup(hcount);

return;

}

if(heap[0].time<n.time)

{

pop();

data[hcount++] = n;

siftup(hcount);

return;

}

int strhash(char* str)

{

//BKDRHash

int seed = 131;

int hash = 0;

while(*str)

{

hash = hash *seed + (*str++);

}

return (hash & 0x7FFFFFFF);

}

void init()

{

int i;

for(i=0;i<MAXN;i++)

data[i].time=-1;

}

void solve()

{

int i;

for(i=0;i<MAXN;i++)

{

if(data[i].time>=0)

{

add(data[i]);

}

//輸出heap

for(i=0;i<NUM;i++)

{

printf("%s %d\n",data[i].str,data[i].time);

}

int main()

{

init();

int index;

char str[256];

freopen("in.txt","r",stdin);

freopen("out.txt","w",stdout);

while(scanf("%s",str)!=EOF)

{

index = strhash(str);

index = index%MAXN;

//找一個沒放的或者和它相同的

while(data[index].time != -1 && strcmp(data[index].str,str) != 0)

{

index++;

index%=MAXN;

}

if(data[index].time == -1)

{

strcpy(data[index].str,str);

data[index].time = 1;

}

else

{

data[index].time++;

}

#include <stdio.h>

#include <string.h>

#include <stdlib.h>

#define DEBUG

#define MAXN 7997997

typedef struct _node

{

int num;

int time;

struct _node* next;

}node;

node zhash[MAXN],fhash[MAXN];

int A[5000],B[5000],C[5000],D[5000];

void init(int n)

{

int i;

for(i=0;i<n;i++)

{

zhash[i].time = -1;

fhash[i].time = -1;

zhash[i].next = NULL;

fhash[i].next = NULL;

}

void insert(int num)

{

node* h;

if(num >= 0)

h = zhash;

else

h = fhash;

int index = abs(num)%MAXN;

if(h[index].time==-1)

{

h[index].time = 1;

h[index].num = num;

}

else

{

node* p = &h[index];

while(p!=NULL && p->num!=num)

p = p->next;

if(p != NULL)

{

p->time++;

}

else

{

p = (node*)malloc(sizeof(node));

p->num = num;

p->time = 1;

p->next =NULL;

}

int getres(int num)

{

node* h;

if(num <= 0)

h = zhash;

else

h = fhash;

int index = abs(num)%MAXN;

node* p = &h[index];

while(p!=NULL && p->num!=(num*(-1)))

{

p = p->next;

}

if(p == NULL)

return 0;

else

return p->time;

}

int main()

{

int i,j,count,res=0,tmp;

scanf("%d",&count);

init(MAXN);

for(i=0;i<count;i++)

{

scanf("%d%d%d%d",&A[i],&B[i],&C[i],&D[i]);

}

for(i=0;i<count;i++)

for(j=0;j<count;j++)

{

tmp = A[i]+B[j];

insert(tmp);

}

for(i=0;i<count;i++)

for(j=0;j<count;j++)

{

tmp = C[i]+D[j];

res += getres(tmp);

}

printf("%d\n",res);

#ifdef DEBUG

scanf("%d",&i);

#endif

return 0;

}

solve();

return 0;

}

繼續hash算法。
其實本來是想搞ACM的hash的，苦于各種找不到資料。
POJ2785。http://poj.org/problem?id=2785
下面代碼沒AC。
題目自己看吧，思路是正數一個hash表，負數一個hash表，然后把O(N^4)復雜度搞成O(N^2)。上面玩的是開散列。下面是閉散列。無代碼規范代碼。

posted on 2011-11-15 19:57 bigrabbit 閱讀(6012) 評論(4) 編輯收藏引用

對于關鍵字為字符串類型的散列函數，其本質和整數模散列差不多，對7位acsii碼的字符串，先把它轉到對應的整數，比如"abcd"，對應的整數為97*128^3+98*128^2+99*128^1+100（128為基數），考慮到字符串長度，上面的公式計算的結果可能會溢出，因此根據mod函數的性質及霍納算法，可以改進為：((((((97%M)*128+98)%M)*128+99)%M)*128+100)%M，你的strhash實現是累加求和，131為因子，最終結果再取31位而得散列值，而131、31都是素數，素數有處于減少沖突。回復更多評論

# re: hash初步 2011-11-20 18:34 bigrabbit

@春秋十二月
原來ELFhash做的是這個事情。。。請問有沒有什么關于hash深入一些全面一些的資料？回復更多評論

刷新評論列表

只有注冊用戶登錄后才能發表評論。




網站導航: 博客園 IT新聞 BlogJava 博問 Chat2DB 管理

青青草原综合久久大伊人导航_色综合久久天天综合_日日噜噜夜夜狠狠久久丁香五月_热久久这里只有精品

a tutorial on computer science

常用鏈接

留言簿(1)

隨筆檔案

friends

搜索

最新評論

閱讀排行榜

評論排行榜

評論