學著站在巨人的肩膀上

金融數學, Information Search, Compiler, OS

C++博客 :: 首頁 :: 新隨筆 :: 聯系 :: 聚合

:: 管理 ::

12 隨筆 :: 0 文章 :: 8 評論 :: 0 Trackbacks

<

2009年12月

>

日

一

二

三

四

五

六

29

30

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

1

2

3

4

5

6

7

8

9

公告

發布經典原創文章

常用鏈接

留言簿(1)

隨筆分類

中文文本信息處理(9) (rss)

隨筆檔案

搜索

閱讀排行榜

author:http://hi.baidu.com/jrckkyy

author:http://blog.csdn.net/jrckkyy

上一篇主要介紹了倒排索引建立相關的文件及中間文件。
TSE建立索引在運行程序上的大致步驟可以簡化分為以下幾步：

1、運行命令#./DocIndex
會用到一個文件 tianwang.raw.520 //爬取回來的原始文件，包含多個網頁的所有信息，所以很大。這也是一個有待解決的問題：到底存成大文件（如果過大會超過2G或4G的限制，而且文件過大索引效率過低）還是小文件（文件數過多，用于打開關閉文件句柄的消耗過大）還有待思考。另外，存儲方案最終肯定是要做成分布式的，總文件量肯定會上TB，而TSE只支持小型的搜索引擎需求。
會產生以下三個文件 Doc.idx, Url.idx, DocId2Url.idx //Data文件夾中的Doc.idx、Url.idx和DocId2Url.idx

2、運行命令#sort Url.idx|uniq > Url.idx.sort_uniq //Data文件夾中的Url.idx.sort_uniq
會用到一個文件 Url.idx文件 //md5 hash 之后的url完整地址和document id值對
會產生一個文件 Url.idx.sort_uniq //URL消重，md5 hash排序，提高檢索效率

3、運行命令#./DocSegment Tianwang.raw.2559638448
會用到一個文件 Tianwang.raw.2559638448 //Tianwang.raw.2559638448為爬回來的文件，每個頁面包含http頭，分詞為后面建立倒排索引做準備
會產生一個文件 Tianwang.raw.2559638448.seg //分詞文件，由一行document id號和一行文檔分詞組（只對每個文檔<html></html>中<head></head><body></body>等文字標記中的文本進行分組）構成

4、運行命令#./CrtForwardIdx Tianwang.raw.2559638448.seg > moon.fidx //建立獨立的正向索引

5、運行命令
#set | grep "LANG"
#LANG=en; export LANG;
#sort moon.fidx > moon.fidx.sort

6、運行命令#./CrtInvertedIdx moon.fidx.sort > sun.iidx //建立倒排索引

我們先從建立索引的第一個程序DocIndex.cpp開始分析。(注釋約定：Tianwang.raw.2559638448是抓回來合并成的大文件，后面就叫大文件；里面包含了很多篇html文檔，這些按規律分隔的文檔就叫做一篇一篇的文檔)

//DocIndex.h start-------------------------------------------------------------

// Shared file names and limits for the TSE indexing and search programs.
// DocIndex.cpp pulls this header in as "Comm.h".
#ifndef _COMM_H_040708_
#define _COMM_H_040708_

// NOTE(review): the header names on the #include lines below are missing in this
// copy (presumably stripped as <...> markup during extraction); they must be
// restored before this header can compile.
#include

#include
#include
#include
#include
#include
#include
#include

using namespace std;

const unsigned HEADER_BUF_SIZE = 1024;
const unsigned RstPerPage = 20; // number of search results returned to the front end per page

//iceway
//const unsigned MAX_DOC_IDX_ID = 21312; // used by DocSegment.cpp
const unsigned MAX_DOC_IDX_ID = 22104;

//const string IMG_INFO_NAME("./Data/s1.1");
const string INF_INFO_NAME("./Data/sun.iidx"); // inverted index file
// Example entries, presumably "<term> <doc id> <doc id> ...":
//   <term> 14383 16151 16151 16151 1683 207 6302 7889 8218 8218 8637
//   <term> 1085 1222

// Original note: a term file of over 90,000 entries, including special symbols,
// punctuation and Chinese characters.
// NOTE(review): that note does not seem to describe the constant below — confirm.
const string DOC_IDX_NAME("./Data/Doc.idx"); // document index file; NOTE(review): the original comment said "inverted index file", which looks like a copy-paste slip
const string RAWPAGE_FILE_NAME("./Data/Tianwang.swu.iceway.1.0");

//iceway
const string DOC_FILE_NAME = "Tianwang.swu.iceway.1.0"; // used by DocIndex.cpp
const string Data_DOC_FILE_NAME = "./Data/Tianwang.swu.iceway.1.0"; // used by Snapshot.cpp

//const string RM_THUMBNAIL_FILES("rm -f ~/public_html/ImgSE/timg/*");

//const string THUMBNAIL_DIR("/ImgSE/timg/");

// NOTE(review): the guard name after #endif is an extra token rather than a
// comment; compilers typically warn about it.
#endif _COMM_H_040708_
//DocIndex.h end--------------------------------------------------------------

//DocIndex.cpp start-----------------------------------------------------------

#include
#include
#include "Md5.h"
#include "Url.h"
#include "Document.h"

//iceway(mnsc)
#include "Comm.h"
#include

using namespace std;

int main(int argc, char* argv[])
{
    //ifstream ifs("Tianwang.raw.2559638448");
//ifstream ifs("Tianwang.raw.3023555472");
//iceway(mnsc)
ifstream ifs(DOC_FILE_NAME.c_str()); //打開Tianwang.raw.3023555472文件，最原始的文件
if (!ifs)
{
    cerr << "Cannot open " << "tianwang.img.info" << " for input\n";
    return -1;
    }
ofstream ofsUrl("Url.idx", ios::in|ios::out|ios::trunc|ios::binary); //建立并打開Url.idx文件
if( !ofsUrl )
{
  cout << "error open file " << endl;
}

ofstream ofsDoc("Doc.idx", ios::in|ios::out|ios::trunc|ios::binary); //建立并打開Doc.idx文件
if( !ofsDoc )
{
cout << "error open file " << endl;
}

ofstream ofsDocId2Url("DocId2Url.idx", ios::in|ios::out|ios::trunc|ios::binary); //建立并打開DocId2Url.idx文件
if( !ofsDocId2Url )
{
cout << "error open file " << endl;
}

int cnt=0; //文檔編號從0開始計算
string strLine,strPage;
CUrl iUrl;
CDocument iDocument;
CMD5 iMD5;

int nOffset = ifs.tellg();
while (getline(ifs, strLine))
{
  if (strLine[0]=='\0' || strLine[0]=='#' || strLine[0]=='\n')
  {
   nOffset = ifs.tellg();
   continue;
  }

  if (!strncmp(strLine.c_str(), "version: 1.0", 12)) //判斷第一行是否是version: 1.0如果是就解析下去
  {
   if(!getline(ifs, strLine)) break;
   if (!strncmp(strLine.c_str(), "url: ", 4)) //判斷第二行是否是url: 如果是則解析下去
   {
    iUrl.m_sUrl = strLine.substr(5); //截取url: 五個字符之后的url內(nèi)容
    iMD5.GenerateMD5( (unsigned char*)iUrl.m_sUrl.c_str(), iUrl.m_sUrl.size() ); //對url用md5 hash處理
    iUrl.m_sChecksum = iMD5.ToString(); //將字符數(shù)組組合成字符串這個函數(shù)在Md5.h中實現(xiàn)

   } else
   {
    continue;
   }

   while (getline(ifs, strLine))
   {
    if (!strncmp(strLine.c_str(), "length: ", 8)) //一直讀下去直到判斷澹澹(相對第五行)惺欠袷莑ength: 是則接下下去
    {
     sscanf(strLine.substr(8).c_str(), "%d", &(iDocument.m_nLength)); //將該塊所代表網(wǎng)頁的實際網(wǎng)頁內(nèi)容長度放入iDocument數(shù)據(jù)結構中
     break;
    }
   }

getline(ifs, strLine); //跳過相對第六行故意留的一個空行

   iDocument.m_nDocId = cnt; //將文檔編號賦值到iDocument數(shù)據(jù)結構中
   iDocument.m_nPos = nOffset; //文檔結尾在大文件中的結束行號
   char *pContent = new char[iDocument.m_nLength+1]; //新建該文檔長度的字符串指針

   memset(pContent, 0, iDocument.m_nLength+1); //每一位初始化為0
   ifs.read(pContent, iDocument.m_nLength); //根據(jù)獲得的文檔長度讀取澹(其中包含協(xié)議頭)讀取文檔內(nèi)容
   iMD5.GenerateMD5( (unsigned char*)pContent, iDocument.m_nLength );
   iDocument.m_sChecksum = iMD5.ToString(); //將字符數(shù)組組合成字符串這個函數(shù)在Md5.h中實現(xiàn)

   delete[] pContent;

   ofsUrl << iUrl.m_sChecksum ; //將md5hash后的url寫入Url.idx文件
   ofsUrl << "\t" << iDocument.m_nDocId << endl; //在一行中一個tab距離分隔，將文件編號寫入Url.idx文件

   ofsDoc << iDocument.m_nDocId ; //將文件編號寫入Doc.idx文件
   ofsDoc << "\t" << iDocument.m_nPos ; //在一行中一個tab距離分隔，將該文檔結束行號澹(同樣也是下一文檔開始行號)寫入Doc.idx文件
   //ofsDoc << "\t" << iDocument.m_nLength ;
   ofsDoc << "\t" << iDocument.m_sChecksum << endl; //在一行中一個tab距離分隔，將md5hash后的url寫入Doc.idx文件

ofsDocId2Url << iDocument.m_nDocId ; //將文件編號寫入DocId2Url.idx文件
ofsDocId2Url << "\t" << iUrl.m_sUrl << endl; //將該文檔的完整url寫入DocId2Url.idx文件

cnt++; //文檔編號加一說明該以文檔分析完畢，生成下一文檔的編號
}

nOffset = ifs.tellg();

}

//最后一行只有文檔號和上一篇文檔結束號
ofsDoc << cnt ;
ofsDoc << "\t" << nOffset << endl;

return(0);
}

//DocIndex.cpp end-----------------------------------------------------------

author:http://hi.baidu.com/jrckkyy

author:http://blog.csdn.net/jrckkyy

posted on 2009-12-10 23:00 學者站在巨人的肩膀上閱讀(1352) 評論(1) 編輯收藏引用所屬分類: 中文文本信息處理

只有注冊用戶登錄后才能發表評論。


相關文章: 自頂向下學搜索引擎——北大天網搜索引擎TSE分析及完全注釋[6]倒排索引的建立的程序分析(4) 自頂向下學搜索引擎——北大天網搜索引擎TSE分析及完全注釋[6]倒排索引的建立的程序分析(3) 自頂向下學搜索引擎——北大天網搜索引擎TSE分析及完全注釋[6]倒排索引的建立的程序分析(2) 自頂向下學搜索引擎——北大天網搜索引擎TSE分析及完全注釋[6]倒排索引的建立的程序分析(1) 自頂向下學搜索引擎——北大天網搜索引擎TSE分析及完全注釋[5]倒排索引的建立及文件介紹 自頂向下學搜索引擎——北大天網搜索引擎TSE分析及完全注釋[4]小結 自頂向下學搜索引擎——北大天網搜索引擎TSE分析及完全注釋[3]來到關鍵字分詞及相關性分析程序 自頂向下學搜索引擎——北大天網搜索引擎TSE分析及完全注釋[2]路過查詢處理程序 自頂向下學搜索引擎——北大天網搜索引擎TSE分析及完全注釋[1]尋找搜索引擎入口

網站導航: 博客園 IT新聞 BlogJava 博問 Chat2DB 管理

青青草原综合久久大伊人导航_色综合久久天天综合_日日噜噜夜夜狠狠久久丁香五月_热久久这里只有精品