a tutorial on computer science

C++博客 :: 首頁 :: 新隨筆 :: 聯系 :: 聚合

:: 管理 ::

21 隨筆 :: 0 文章 :: 17 評論 :: 0 Trackbacks

<

2012年4月

>

日

一

二

三

四

五

六

25

26

27

28

29

30

31

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

1

2

3

4

5

常用鏈接

留言簿(1)

隨筆檔案

friends

遠行

搜索

閱讀排行榜

評論排行榜

hash初步

   hash算法一直被我認為是一種處理大數據量的高效算法(時間複雜度)。
   從一道百度面試題開始。
    搜索引擎會通過日志文件把用戶每次檢索使用的所有檢索串都記錄下來，每個查詢串的長度為1-255字節。
    假設目前有一千萬個記錄（這些查詢串的重復度比較高，雖然總數是1千萬，但如果除去重復后，不超過3百萬個。一個查詢串的重復度越高，說明查詢它的用戶越多，也就是越熱門。），請你統計最熱門的10個查詢串，要求使用的內存不能超過1G。

   好。首先想暴力解決下，看看內存夠不夠。大約255×10^7B內存，2.4G的樣子。。超內存了。。汗。。300萬，那就是0.8G，剛剛好。很自然的，我們可以想到，如果每次向內存讀一個字符串，然后把那個字符串出現的次數和字符串存起來，這樣，就可以在不超過1G的情況下搞定。編程珠璣上面有這道題目的int版本，大概就是問10億個整數（從1到10億，缺一個），不超過多少內存，要求最快找出缺少的數。那個題目也是利用hash的思想，不過它的hash函數就是它自己就是了。開一個10億比特的內存，然后把flag[num]設置一下，最后再統計一下。好吧，這個題目是不是可以利用類似的思想呢？好吧，hash吧。
   hash算法的基本步驟是：把數據存放到key(data[i])里面。如此簡單。就是建立data[i]和i的映射關系，然后利用數組可以隨機訪問的特點，使得在O(1)的時間複雜度再次找到數據(理想情況，可能沖突)！hash最直接的利用就是lookup table,查找表。建立一個hash表，然后可以進行快速查找。（如果出現訪問沖突怎么辦呢？大致分為兩種辦法：開散列和閉散列。開散列就是找到了這個位置被別人占了，好，找個規則換地方。閉散列就是這個地方被別人站著，我跟在他后面(鏈表)。注意：教科書上的通行叫法與此正好相反——換位置的開放定址法一般稱為閉散列，掛鏈表的鏈地址法一般稱為開散列；本文沿用作者自己的叫法。高深的玩意研究不懂，MARK之，以后慢慢看）。
   hash解決此題：網上找一個字符串hash函數看看先(看不懂，直接用。哪位大神可以告訴我為什么或者詳細資料??)。建立一個空的hash表，每次讀一個字符串。找到這個字符串的key（就是用hash函數對它XXX）,返回一個位置。看看那個位置是不是被別人占了。如果被別人占了，我就往后走，直到找到一個空位子。坐下。當然這個過程也許會找到和自己一樣的，那樣就把它的訪問次數+1。好了，hash表建好了，里面有300萬個字符串，每一個字符串的搜索次數也統計出來了。
   問題完成了第一步。
第二步是，統計TOP K字符串。這個。。可以排個序，qsort,O(N*logN),太挫了。果斷用個小頂堆，把複雜度降到O(N*log(K))，K 很小，這個很劃算啊。
關于堆的問題就不詳細闡述了，實現簡單(siftdown(int),siftup(int))，目的明確(取最值，增加刪除元素)。下面是測試的代碼。當然我沒有那么大的數據量，寫的代碼也僅供測試之用。

#include <stdio.h>

#include <string.h>

#define MAXN 47   /* number of slots in the linear-probing hash table */

#define NUM 10    /* K: how many of the most frequent strings are kept */

/* One record: a query string and how many times it has been seen. */
typedef struct
{
    char str[256];
    int time;       /* occurrence count; -1 marks an empty hash-table slot */
}node;

/* Hash table of distinct strings, filled by main() and scanned by solve(). */
node data[MAXN];

/*
 * Min-heap holding the NUM most frequent records seen so far.
 * The heap is 1-based: the root is heap[1], the children of heap[i] are
 * heap[2*i] and heap[2*i+1], and heap[0] is unused -- hence NUM + 1 slots.
 */
node heap[NUM + 1];

/* Number of records currently stored in heap[1..hcount]. */
int hcount = 0;

/* Exchange two records. */
void swap(node& a,node& b)
{
    node tmp = a;
    a = b;
    b = tmp;
}

/* Restore the min-heap property by moving heap[i] down towards the leaves. */
void siftdown(int i)
{
    int minst = i;

    if (2*i <= hcount && heap[2*i].time < heap[minst].time)
        minst = 2*i;
    if (2*i+1 <= hcount && heap[2*i+1].time < heap[minst].time)
        minst = 2*i+1;

    if (minst != i)
    {
        swap(heap[i], heap[minst]);
        siftdown(minst);
    }
}

/* Restore the min-heap property by moving heap[i] up towards the root. */
void siftup(int i)
{
    /* Stop at the root: heap[1] has no parent (heap[0] is unused). */
    while (i > 1 && heap[i].time < heap[i/2].time)
    {
        swap(heap[i], heap[i/2]);
        i /= 2;
    }
}

/* Remove the smallest record (the root). Does nothing on an empty heap. */
void pop()
{
    if (hcount <= 0)
        return;

    swap(heap[1], heap[hcount]);
    hcount--;
    siftdown(1);
}

/*
 * Offer a record to the top-NUM heap.
 * While fewer than NUM records are stored the record is always kept;
 * afterwards it replaces the current minimum only if its count is larger.
 */
void add(node n)
{
    if (hcount < NUM)
    {
        heap[++hcount] = n;
        siftup(hcount);
        return;
    }

    if (hcount > 0 && heap[1].time < n.time)
    {
        pop();
        heap[++hcount] = n;
        siftup(hcount);
    }
}

/*
 * BKDR string hash.
 * Returns a non-negative value (the low 31 bits of the running hash).
 * The arithmetic is done in unsigned int so that wrap-around on long
 * strings is well defined instead of being signed overflow.
 */
int strhash(const char* str)
{
    const unsigned int seed = 131;
    unsigned int hash = 0;

    while (*str)
    {
        hash = hash * seed + (unsigned int)(*str++);
    }

    return (int)(hash & 0x7FFFFFFF);
}

/* Mark every hash-table slot as empty. */
void init()
{
    int i;

    for (i = 0; i < MAXN; i++)
        data[i].time = -1;
}

/*
 * Feed every occupied hash-table slot into the top-NUM heap, then print
 * the surviving records (in heap order, not sorted) as "<string> <count>".
 */
void solve()
{
    int i;

    for (i = 0; i < MAXN; i++)
    {
        if (data[i].time >= 0)
        {
            add(data[i]);
        }
    }

    /* Output the heap; only heap[1..hcount] holds valid records. */
    for (i = 1; i <= hcount; i++)
    {
        printf("%s %d\n", heap[i].str, heap[i].time);
    }
}

/*
 * Driver for the string-frequency program: reads whitespace-separated
 * tokens from in.txt and counts each distinct token in the hash table
 * `data`, resolving collisions by stepping to the next slot.
 *
 * NOTE(review): the closing braces of the read loop and of main() are not
 * part of this span -- the listing is interrupted here.
 */
int main()

{

init();

int index;

char str[256];

// Redirect stdin/stdout to files. The freopen() results are not checked.
freopen("in.txt","r",stdin);

freopen("out.txt","w",stdout);

// NOTE(review): "%s" has no field width, so a token of 256 or more
// characters overruns str.
while(scanf("%s",str)!=EOF)

{

// Home slot for this token.
index = strhash(str);

index = index%MAXN;

// Find a slot that is either empty or already holds this same string.
// NOTE(review): if all MAXN slots hold other strings this never terminates.
while(data[index].time != -1 && strcmp(data[index].str,str) != 0)

{

index++;

index%=MAXN;

}

// Empty slot: first occurrence, store the string with a count of 1.
if(data[index].time == -1)

{

strcpy(data[index].str,str);

data[index].time = 1;

}

// Otherwise the slot already holds this string: bump its count.
else

{

data[index].time++;

}

#include <stdio.h>

#include <string.h>

#include <stdlib.h>

#define DEBUG

#define MAXN 7997997

/* Declare the short name first so the link field below can use it. */
typedef struct _node node;

/*
 * One entry of a chained hash bucket.
 *   num  - the value stored in this entry
 *   time - how many times that value was inserted; init() sets -1 on the
 *          bucket heads to mark them as empty
 *   next - following entry in the same bucket, or NULL at the end
 */
struct _node
{
    int num;
    int time;
    node* next;
};

/* Bucket heads of the two hash tables: insert() files values >= 0 under
   zhash and negative values under fhash. */
node zhash[MAXN],fhash[MAXN];

/* The four input columns filled by main(), one element per input row. */
int A[5000],B[5000],C[5000],D[5000];

void init(int n)

{

int i;

for(i=0;i<n;i++)

{

zhash[i].time = -1;

fhash[i].time = -1;

zhash[i].next = NULL;

fhash[i].next = NULL;

}

/*
 * Record one occurrence of num.
 * Values >= 0 go into zhash, negative values into fhash; the bucket is
 * abs(num) % MAXN. If num is already present in the bucket its count is
 * incremented, otherwise a new entry with count 1 is created: in the
 * bucket head when the head is empty, else appended to the bucket's chain.
 * Terminates the program if a chain node cannot be allocated.
 */
void insert(int num)
{
    node* h = (num >= 0) ? zhash : fhash;
    int index = abs(num) % MAXN;
    node* p = &h[index];
    node* fresh;

    /* Empty bucket: the head itself stores the first value. */
    if (p->time == -1)
    {
        p->time = 1;
        p->num = num;
        return;
    }

    /* Walk the chain, stopping on the LAST node so a new one can be linked. */
    for (;;)
    {
        if (p->num == num)
        {
            p->time++;
            return;
        }
        if (p->next == NULL)
            break;
        p = p->next;
    }

    fresh = (node*)malloc(sizeof *fresh);
    if (fresh == NULL)
    {
        fprintf(stderr, "insert: out of memory\n");
        exit(EXIT_FAILURE);
    }
    fresh->num = num;
    fresh->time = 1;
    fresh->next = NULL;

    /* Attach the new node to the chain; without this link it would be lost. */
    p->next = fresh;
}

/*
 * Return how many times the value -num was inserted, or 0 if it never was.
 * The table is chosen to match where insert() would have put -num:
 * zhash when -num >= 0 (i.e. num <= 0), fhash otherwise.
 */
int getres(int num)
{
    node* h = (num <= 0) ? zhash : fhash;
    int want = -num;
    node* p;

    for (p = &h[abs(num) % MAXN]; p != NULL; p = p->next)
    {
        /* Skip an empty bucket head (time == -1): its num field is not a
           stored value and must not match, e.g. when looking up 0. */
        if (p->time > 0 && p->num == want)
            return p->time;
    }

    return 0;
}

/*
 * Reads a row count followed by that many rows of four integers
 * (A, B, C, D), then prints the number of index tuples (i, j, k, l)
 * with A[i] + B[j] + C[k] + D[l] == 0.
 * All A+B sums are inserted into the hash tables; each C+D sum then looks
 * up how many A+B sums are its negation.
 * Returns 1 on malformed input or a row count the arrays cannot hold.
 */
int main()
{
    int i, j, count, tmp;
    /* The tuple count can reach count^4, which does not fit in an int. */
    long long res = 0;

    if (scanf("%d", &count) != 1
        || count < 0
        || count > (int)(sizeof A / sizeof A[0]))
        return 1;

    init(MAXN);

    for (i = 0; i < count; i++)
    {
        if (scanf("%d%d%d%d", &A[i], &B[i], &C[i], &D[i]) != 4)
            return 1;
    }

    for (i = 0; i < count; i++)
        for (j = 0; j < count; j++)
        {
            tmp = A[i] + B[j];
            insert(tmp);
        }

    for (i = 0; i < count; i++)
        for (j = 0; j < count; j++)
        {
            tmp = C[i] + D[j];
            res += getres(tmp);
        }

    printf("%lld\n", res);

#ifdef DEBUG
    /* Debug builds wait for one more number before exiting
       (presumably to keep a console window open). */
    scanf("%d", &i);
#endif

    return 0;
}

solve();

return 0;

}

繼續hash算法。
其實本來是想搞ACM的hash的，苦于各種找不到資料。
POJ2785。http://poj.org/problem?id=2785
下面代碼沒AC。
題目自己看吧，思路是正數一個hash表，負數一個hash表，然后把O(N^4)複雜度搞成O(N^2)。上面玩的是開散列。下面是閉散列。無代碼規范代碼。

posted on 2011-11-15 19:57 bigrabbit 閱讀(6012) 評論(4) 編輯收藏引用

對于關鍵字為字符串類型的散列函數，其本質和整數模散列差不多，對7位ASCII碼的字符串，先把它轉到對應的整數，比如"abcd"，對應的整數為97*128^3+98*128^2+99*128^1+100（128為基數），考慮到字符串長度，上面的公式計算的結果可能會溢出，因此根據mod函數的性質及霍納算法，可以改進為：((((((97%M)*128+98)%M)*128+99)%M)*128+100)%M，你的strhash實現是累加求和，131為因子，最終結果再取31位而得散列值，而131、31都是素數，素數有助于減少沖突。回復 更多評論

# re: hash初步 2011-11-20 18:34 bigrabbit

@春秋十二月
原來ELFhash做的是這個事情。。。請問有沒有什么關于hash深入一些全面一些的資料？回復 更多評論

刷新評論列表

只有注冊用戶登錄后才能發表評論。




網站導航: 博客園 IT新聞 BlogJava 博問 Chat2DB 管理

青青草原综合久久大伊人导航_色综合久久天天综合_日日噜噜夜夜狠狠久久丁香五月_热久久这里只有精品

a tutorial on computer science

常用鏈接

留言簿(1)

隨筆檔案

friends

搜索

最新評論

閱讀排行榜

評論排行榜

評論