久久热这里只有精品在线观看,一个色综合久久,久久久久亚洲AV成人网人人网站

自顶向下学搜索引擎——北大天网搜索引擎TSE分析及完全注释[6]倒排索引的建立的程序分析(4)

Thu, 10 Dec 2009 15:03:00 GMT

以下是根据正向索引建立倒排索引的注释：

int main(int argc, char* argv[])    //./CrtInvertedIdx moon.fidx.sort > sun.iidx
{
    ifstream ifsImgInfo(argv[1]);
    if (!ifsImgInfo)
    {
        cerr << "Cannot open " << argv[1] << " for input\n";
        return -1;
    }

    string strLine,strDocNum,tmp1="";
    int cnt = 0;
    while (getline(ifsImgInfo, strLine))
    {
        string::size_type idx;
        string tmp;

idx = strLine.find("\t");
tmp = strLine.substr(0,idx);

if (tmp.size()<2 || tmp.size() > 8) continue;

if (tmp1.empty()) tmp1=tmp;

        if (tmp == tmp1)
        {
            strDocNum = strDocNum + " " + strLine.substr(idx+1);
        }
        else
        {
            if ( strDocNum.empty() )
                strDocNum = strDocNum + " " + strLine.substr(idx+1);

            cout << tmp1 << "\t" << strDocNum << endl;
            tmp1 = tmp;
            strDocNum.clear();
            strDocNum = strDocNum + " " + strLine.substr(idx+1);
        }

        cnt++;
        //if (cnt==100) break;
    }
    cout << tmp1 << "\t" << strDocNum << endl; //倒排索引中每个字典单词后的文档编号以table键�ؓ(f��)间隔

return 0;
}

学者站在巨人的肩膀�?/a> 2009-12-10 23:03 发表评论

自顶向下学搜索引擎——北大天网搜索引擎TSE分析及完全注释[6]倒排索引的建立的程序分析(2)

Thu, 10 Dec 2009 15:02:00 GMT

前面的DocIndex程序输入一个Tianwang.raw.*****文件，会产生以下三个文件：Doc.idx, Url.idx, DocId2Url.idx。我们这里对DocSegment程序进行分析。

�q�里输入 Tianwang.raw.*****�Q�Doc.idx�Q�Url.idx.sort_uniq�{�三个文�Ӟ��输出一个Tianwang.raw.***.seg 分词完毕的文�?/p>

int main(int argc, char* argv[])
{
    string strLine, strFileName=argv[1];
    CUrl iUrl;
    vector vecCUrl;
    CDocument iDocument;
    vector vecCDocument;
    unsigned int docId = 0;

    //ifstream ifs("Tianwang.raw.2559638448");
    ifstream ifs(strFileName.c_str()); //DocSegment Tianwang.raw.****
    if (!ifs)
    {
        cerr << "Cannot open tianwang.img.info for input\n";
        return -1;
    }

    ifstream ifsUrl("Url.idx.sort_uniq");   //排序�q�消重后的url字典
    if (!ifsUrl)
    {
        cerr << "Cannot open Url.idx.sort_uniq for input\n";
        return -1;
    }
    ifstream ifsDoc("Doc.idx"); //字典文�g
    if (!ifsDoc)
    {
        cerr << "Cannot open Doc.idx for input\n";
        return -1;
    }

    while (getline(ifsUrl,strLine)) //偏离url字典存入一个向量内存中
    {
        char chksum[33];
        int docid;

        memset(chksum, 0, 33);
        sscanf( strLine.c_str(), "%s%d", chksum, &docid );
        iUrl.m_sChecksum = chksum;
        iUrl.m_nDocId = docid;
        vecCUrl.push_back(iUrl);
    }

    while (getline(ifsDoc,strLine))     //偏离字典文�g��其攑օ�一个向量内存中
    {
        int docid,pos,length;
        char chksum[33];

        memset(chksum, 0, 33);
        sscanf( strLine.c_str(), "%d%d%d%s", &docid, &pos, &length,chksum );
        iDocument.m_nDocId = docid;
        iDocument.m_nPos = pos;
        iDocument.m_nLength = length;
        iDocument.m_sChecksum = chksum;
        vecCDocument.push_back(iDocument);
    }

    strFileName += ".seg";
    ofstream fout(strFileName.c_str(), ios::in|ios::out|ios::trunc|ios::binary);    //讄��完成分词后的数据输出文�g
    for ( docId=0; docId    {

        // find document according to docId
        int length = vecCDocument[docId+1].m_nPos - vecCDocument[docId].m_nPos -1;
        char *pContent = new char[length+1];
        memset(pContent, 0, length+1);
        ifs.seekg(vecCDocument[docId].m_nPos);
        ifs.read(pContent, length);

char *s;
s = pContent;

        // skip Head
        int bytesRead = 0,newlines = 0;
        while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1)
        {
            if (*s == '\n')
                newlines++;
            else
                newlines = 0;
            s++;
            bytesRead++;
        }
        if (bytesRead == HEADER_BUF_SIZE-1) continue;

        // skip header
        bytesRead = 0,newlines = 0;
        while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1)
        {
            if (*s == '\n')
                newlines++;
            else
                newlines = 0;
            s++;
            bytesRead++;
        }
        if (bytesRead == HEADER_BUF_SIZE-1) continue;

        //iDocument.m_sBody = s;
        iDocument.RemoveTags(s);    //去除<>
        iDocument.m_sBodyNoTags = s;

delete[] pContent;
string strLine = iDocument.m_sBodyNoTags;

CStrFun::ReplaceStr(strLine, " ", " ");
CStrFun::EmptyStr(strLine); // set " \t\r\n" to " "

        // segment the document 具体分词处理
        CHzSeg iHzSeg;
        strLine = iHzSeg.SegmentSentenceMM(iDict,strLine);
        fout << docId << endl << strLine;
        fout << endl;

    }

return(0);
}
�q�里只是��光掠媄式的�q�一遍大概的代码�Q�后面我�?x��)有专题详细讲�?parse html �?segment docment �{�技�?/p>

学者站在巨人的肩膀�?/a> 2009-12-10 23:02 发表评论

自顶向下学搜索引擎——北大天网搜索引擎TSE分析及完全注释[6]倒排索引的建立的程序分析(3)

Thu, 10 Dec 2009 15:02:00 GMT

�q�里介绍正向索引的徏立，如果直接建立倒排索引效率上可能会(x��)很低�Q�所以可以先产生正向索引为后面的倒排索引打下基础�?/p>

详细的文件功能和介绍都在�q�里有了介绍自顶向下学搜索引擎——北大天�|�搜索引擎TSE分析�?qi��ng)完全注释[5]倒排索引的徏立及(qi��ng)文�g介绍

CrtForwardIdx.cpp文�g

int main(int argc, char* argv[])    //./CrtForwardIdx Tianwang.raw.***.seg > moon.fidx
{
    ifstream ifsImgInfo(argv[1]);
    if (!ifsImgInfo)
    {
        cerr << "Cannot open " << argv[1] << " for input\n";
        return -1;
    }

    string strLine,strDocNum;
    int cnt = 0;
    while (getline(ifsImgInfo, strLine))
    {
        string::size_type idx;

        cnt++;
        if (cnt%2 == 1) //奇数行�ؓ(f��)文档�~�号
        {
            strDocNum = strLine.substr(0,strLine.size());
            continue;
        }
        if (strLine[0]=='\0' || strLine[0]=='#' || strLine[0]=='\n')
        {
            continue;
        }

        while ( (idx = strLine.find(SEPARATOR)) != string::npos ) //指定查找分界�W?
        {
            string tmp1 = strLine.substr(0,idx);
            cout << tmp1 << "\t" << strDocNum << endl;
            strLine = strLine.substr(idx + SEPARATOR.size());
        }

//if (cnt==100) break;
}

return 0;
}

author:http://hi.baidu.com/jrckkyy

author:http://blog.csdn.net/jrckkyy

学者站在巨人的肩膀�?/a> 2009-12-10 23:02 发表评论

自顶向下学搜索引擎——北大天网搜索引擎TSE分析及完全注释[6]倒排索引的建立的程序分析(1)

Thu, 10 Dec 2009 15:00:00 GMT

author:http://hi.baidu.com/jrckkyy

author:http://blog.csdn.net/jrckkyy

上一��主要介�l�了倒排索引建立相关的文件及(qi��ng)中间文�g�?br>TSE建立索引在运行程序上的大致步骤可以简化分��Z��下几步：(x��)

1、运行命�?./DocIndex
�?x��)用��C��个文�?tianwang.raw.520 //爬取回来的原始文�Ӟ��包含多个�|�页的所有信息，所以很大，�q�也是一个有待解决的问题�Q�到底存成大文�g�Q�如果过大会(x��)��过2G�?G的限�Ӟ��而且文�g�q�大索引效率�q�低�Q�还是小文�g�Q�文件数�q�多用于打开关闭文�g句柄的消耗过大）�q�有待思考，�q�就是存储方案的解决最�l�肯定是要存为分布式的，最�l��L��仉��肯定是会(x��)上TB的，TSE只支持小型的搜烦引擎需求�?nbsp;
�?x��)��生一下三个文�?Doc.idx, Url.idx, DocId2Url.idx //Data文�g夹中的Doc.idx DocId2Url.idx和Doc.idx

2、运行命�?sort Url.idx|uniq > Url.idx.sort_uniq //Data文�g夹中的Url.idx.sort_uniq
�?x��)用��C��个文�?Url.idx文�g //md5 hash 之后的url完整地址和document id值对
�?x��)��生一个文�?Url.idx.sort_uniq //URL消重�Q�md5 hash排序�Q�提高检索效�?/p>

3、运行命�?./DocSegment Tianwang.raw.2559638448
�?x��)用��C��个文�?Tianwang.raw.2559638448 //Tianwang.raw.2559638448为爬回来的文�?�Q�每个页面包含http��_(d��)��分词为后面徏立到排烦引做准备
�?x��)��生一个文�?Tianwang.raw.2559638448.seg //分词文�g�Q�由一行document id号和一行文档分词组�Q�只�Ҏ(gu��)��个文�?lt;html>�?lt;head>�{�文字标��C��的文本进行分�l�）构成

4、运行命�?./CrtForwardIdx Tianwang.raw.2559638448.seg > moon.fidx //建立独立的正向烦�?/p>

5、运行命�?br>#set | grep "LANG"
#LANG=en; export LANG;
#sort moon.fidx > moon.fidx.sort

6、运行命�?./CrtInvertedIdx moon.fidx.sort > sun.iidx //建立倒排索引

我们先从建立索引的第一个程序DocIndex.cpp开始分析�?注释�U�定�Q�Tianwang.raw.2559638448是抓回来合�ƈ成的大文�Ӟ��后面��叫大文�Ӟ��里面包含了很多篇html文档�Q�里面的文档有规律的分隔��叫做一��一��的文档)

//DocIndex.h start-------------------------------------------------------------

#ifndef _COMM_H_040708_
#define _COMM_H_040708_

// Shared constants and data-file names for the TSE tools.
// NOTE(review): the header names of the #include directives below are
// missing in this text and cannot be recovered from here; restore them
// from the real header before compiling.
#include

#include
#include
#include
#include
#include
#include
#include

using namespace std;

// Limit used by DocSegment.cpp when scanning past a header section.
const unsigned HEADER_BUF_SIZE = 1024;
const unsigned RstPerPage = 20; // number of search results returned per page to the front end

//iceway
//const unsigned MAX_DOC_IDX_ID = 21312; // used in DocSegment.cpp
const unsigned MAX_DOC_IDX_ID = 22104;

//const string IMG_INFO_NAME("./Data/s1.1");
const string INF_INFO_NAME("./Data/sun.iidx"); // inverted index file
// Sample sun.iidx lines (a term followed by doc ids; the terms are unreadable in this text):
//   <term> 14383 16151 16151 16151 1683 207 6302 7889 8218 8218 8637
//   <term> 1085 1222

// NOTE(review): the original comment here is garbled; it appears to describe
// a file of 90,000+ characters including symbols, punctuation and Chinese characters.
const string DOC_IDX_NAME("./Data/Doc.idx"); // document index file (original comment said "inverted index file" - presumably a copy-paste slip)
const string RAWPAGE_FILE_NAME("./Data/Tianwang.swu.iceway.1.0");

//iceway
const string DOC_FILE_NAME = "Tianwang.swu.iceway.1.0"; // used in DocIndex.cpp
const string Data_DOC_FILE_NAME = "./Data/Tianwang.swu.iceway.1.0"; // used in Snapshot.cpp

//const string RM_THUMBNAIL_FILES("rm -f ~/public_html/ImgSE/timg/*");

//const string THUMBNAIL_DIR("/ImgSE/timg/");

#endif _COMM_H_040708_
//DocIndex.h end--------------------------------------------------------------//DocIndex.cpp start-----------------------------------------------------------

#include
#include
#include "Md5.h"
#include "Url.h"
#include "Document.h"

//iceway(mnsc)
#include "Comm.h"
#include

using namespace std;

int main(int argc, char* argv[])
{
    //ifstream ifs("Tianwang.raw.2559638448");
//ifstream ifs("Tianwang.raw.3023555472");
//iceway(mnsc)
ifstream ifs(DOC_FILE_NAME.c_str()); //打开Tianwang.raw.3023555472文�g�Q�最原始的文�?br> if (!ifs)
{
    cerr << "Cannot open " << "tianwang.img.info" << " for input\n";
    return -1;
    }
ofstream ofsUrl("Url.idx", ios::in|ios::out|ios::trunc|ios::binary); //建立�q�打开Url.idx文�g
if( !ofsUrl )
{
  cout << "error open file " << endl;
}

ofstream ofsDoc("Doc.idx", ios::in|ios::out|ios::trunc|ios::binary); //建立�q�打开Doc.idx文�g
if( !ofsDoc )
{
cout << "error open file " << endl;
}

ofstream ofsDocId2Url("DocId2Url.idx", ios::in|ios::out|ios::trunc|ios::binary); //建立�q�打开DocId2Url.idx文�g
if( !ofsDocId2Url )
{
cout << "error open file " << endl;
}

int cnt=0; //文档�~�号�?开始计��?br> string strLine,strPage;
CUrl iUrl;
CDocument iDocument;
CMD5 iMD5;

int nOffset = ifs.tellg();
while (getline(ifs, strLine))
{
  if (strLine[0]=='\0' || strLine[0]=='#' || strLine[0]=='\n')
  {
   nOffset = ifs.tellg();
   continue;
  }

  if (!strncmp(strLine.c_str(), "version: 1.0", 12)) //判断�W�一行是否是version: 1.0如果是就解析下去
  {
   if(!getline(ifs, strLine)) break;
   if (!strncmp(strLine.c_str(), "url: ", 4)) //判断�W�二行是否是url: 如果是则解析下去
   {
    iUrl.m_sUrl = strLine.substr(5); //截取url: 五个字符之后的url内容
    iMD5.GenerateMD5( (unsigned char*)iUrl.m_sUrl.c_str(), iUrl.m_sUrl.size() ); //对url用md5 hash处理
    iUrl.m_sChecksum = iMD5.ToString(); //��字�W�数�l�组合成字符串这个函数在Md5.h中实�?/p>

   } else
   {
    continue;
   }

   while (getline(ifs, strLine))
   {
    if (!strncmp(strLine.c_str(), "length: ", 8)) //一直读下去直到判断�Ҏ(gu��)��(相对�W�五�?惺欠袯��ength: 是则接下下去
    {
     sscanf(strLine.substr(8).c_str(), "%d", &(iDocument.m_nLength)); //��该块所代表�|�页的实际网��内定w��度放入iDocument数据�l�构�?br>     break;
    }
   }

getline(ifs, strLine); //跌��相对�W�六行故意留的一个空�?/p>

iDocument.m_nDocId = cnt; //��文档编可��值到iDocument数据�l�构�?br> iDocument.m_nPos = nOffset; //文档�l�尾在大文�g中的�l�束行号
char *pContent = new char[iDocument.m_nLength+1]; //新徏该文档长度的字符串指�?/p>

   memset(pContent, 0, iDocument.m_nLength+1); //每一位初始化�?
   ifs.read(pContent, iDocument.m_nLength); //�Ҏ(gu��)��获得的文档长度读取澹(其中包含协议�?��d��文档内容
   iMD5.GenerateMD5( (unsigned char*)pContent, iDocument.m_nLength );
   iDocument.m_sChecksum = iMD5.ToString(); //��字�W�数�l�组合成字符串这个函数在Md5.h中实�?br>
   delete[] pContent;

   ofsUrl << iUrl.m_sChecksum ; //��md5hash后的url写入Url.idx文�g
   ofsUrl << "\t" << iDocument.m_nDocId << endl; //在一行中一个tab距离分隔�Q�将文�g�~�号写入Url.idx文�g

   ofsDoc << iDocument.m_nDocId ; //��文件编号写入Doc.idx文�g
   ofsDoc << "\t" << iDocument.m_nPos ; //在一行中一个tab距离分隔�Q�将该文档结束行��h��(同样也是下一文档开始行�?写入Doc.idx文�g
   //ofsDoc << "\t" << iDocument.m_nLength ;
   ofsDoc << "\t" << iDocument.m_sChecksum << endl; //在一行中一个tab距离分隔�Q�将md5hash后的url写入Doc.idx文�g

ofsDocId2Url << iDocument.m_nDocId ; //��文件编号写入DocId2Url.idx文�g
ofsDocId2Url << "\t" << iUrl.m_sUrl << endl; //��该文档的完整url写入DocId2Url.idx文�g

cnt++; //文档�~�号加一说明该以文档分析完毕�Q�生成下一文档的编�?br> }

nOffset = ifs.tellg();

}

//最后一行只有文档号和上一��文档结束号
ofsDoc << cnt ;
ofsDoc << "\t" << nOffset << endl;

return(0);
}

//DocIndex.cpp end-----------------------------------------------------------author:http://hi.baidu.com/jrckkyy

author:http://blog.csdn.net/jrckkyy

学者站在巨人的肩膀�?/a> 2009-12-10 23:00 发表评论

自顶向下学搜索引擎——北大天网搜索引擎TSE分析及完全注释[5]倒排索引的建立及文件介绍

Thu, 10 Dec 2009 14:55:00 GMT

不好意思让大家久等了，前一阵一直在忙考试，终于结束了。呵呵！废话不多说了，下面我们开始吧！

TSE用的是将抓取回来的网��|��档全部装入一个大文档�Q�让后对�q�一个大文档内的数据整体�l�一的徏索引�Q�其中包含了几个步骤�?/p>

view plaincopy to clipboardprint?
1. The document index (Doc.idx) keeps information about each document.

It is a fixed-width ISAM (indexed sequential access method) index, ordered by docID.

The information stored in each entry includes a pointer into the repository,

a document length, a document checksum.

//Doc.idx 文档�~�号文档长度    checksum hash�?nbsp;

0   0   bc9ce846d7987c4534f53d423380ba70

1   76760   4f47a3cad91f7d35f4bb6b2a638420e5

2   141624 d019433008538f65329ae8e39b86026c

3   142350 5705b8f58110f9ad61b1321c52605795

//Doc.idx   end

The url index (url.idx) is used to convert URLs into docIDs.

//url.idx

5c36868a9c5117eadbda747cbdb0725f    0

3272e136dd90263ee306a835c6c70d77    1

6b8601bb3bb9ab80f868d549b5c5a5f3    2

3f9eba99fa788954b5ff7f35a5db6e1f    3

//url.idx   end

It is a list of URL checksums with their corresponding docIDs and is sorted by

checksum. In order to find the docID of a particular URL, the URL's checksum

is computed and a binary search is performed on the checksums file to find its

docID.

    ./DocIndex

        got Doc.idx, Url.idx, DocId2Url.idx //Data文�g夹中的Doc.idx DocId2Url.idx和Doc.idx�?nbsp;

//DocId2Url.idx

0   http://*.*.edu.cn/index.aspx

1   http://*.*.edu.cn/showcontent1.jsp?NewsID=118

2   http://*.*.edu.cn/0102.html

3   http://*.*.edu.cn/0103.html

//DocId2Url.idx end

2. sort Url.idx|uniq > Url.idx.sort_uniq    //Data文�g夹中的Url.idx.sort_uniq

//Url.idx.sort_uniq

//对hash��D��行排�?nbsp;

000bfdfd8b2dedd926b58ba00d40986b    1111

000c7e34b653b5135a2361c6818e48dc    1831

0019d12f438eec910a06a606f570fde8    366

0033f7c005ec776f67f496cd8bc4ae0d    2103

3. Segment document to terms, (with finding document according to the url)

    ./DocSegment Tianwang.raw.2559638448        //Tianwang.raw.2559638448为爬回来的文�?�Q�每个页面包含http�?nbsp;

        got Tianwang.raw.2559638448.seg

//Tianwang.raw.2559638448   爬取的原始网��|��件在文档内部每一个文档之间应该是通过version�Q?lt;/html>和回车做标志位分割的

version: 1.0

url: http://***.105.138.175/Default2.asp?lang=gb

origin: http://***.105.138.175/

date: Fri, 23 May 2008 20:01:36 GMT

ip: 162.105.138.175

length: 38413

HTTP/1.1 200 OK

Server: Microsoft-IIS/5.0

Date: Fri, 23 May 2008 11:17:49 GMT

Connection: keep-alive

Connection: Keep-Alive

Content-Length: 38088

Content-Type: text/html; Charset=gb2312

Expires: Fri, 23 May 2008 11:17:49 GMT

Set-Cookie: ASPSESSIONIDSSTRDCAB=IMEOMBIAIPDFCKPAEDJFHOIH; path=/

Cache-control: private

"





Apabi数字资源�q�_��























//Tianwang.raw.2559638448   end

//Tianwang.raw.2559638448.seg   ��每个页面分成一行如�?注意中间没有回�R作�ؓ(f��)分隔)

1

...

...

...

2

...

...

...

//Tianwang.raw.2559638448.seg   end

//下是 Tiny search 非必��d��?nbsp;

4. Create forward index (docid-->termid)     //建立正向索引

    ./CrtForwardIdx Tianwang.raw.2559638448.seg > moon.fidx

//Tianwang.raw.2559638448.seg ��每个页面分成一行如�?lt;BR>//分词   DocID
1
三星/ s/ 手机/ 论坛/ ,/ 手机/ 铃声/ 下蝲/ ,/ 手机/ 囄��/ 下蝲/ ,/ 手机/
2
...
...
...

1. The document index (Doc.idx) keeps information about each document.

It is a fixed-width ISAM (indexed sequential access method) index, ordered by docID.

The information stored in each entry includes a pointer into the repository,

a document length, a document checksum.

//Doc.idx 文档�~�号文档长度 checksum hash�?/p>

0 0 bc9ce846d7987c4534f53d423380ba70

1 76760 4f47a3cad91f7d35f4bb6b2a638420e5

2 141624 d019433008538f65329ae8e39b86026c

3 142350 5705b8f58110f9ad61b1321c52605795

//Doc.idx end

The url index (url.idx) is used to convert URLs into docIDs.

//url.idx

5c36868a9c5117eadbda747cbdb0725f 0

3272e136dd90263ee306a835c6c70d77 1

6b8601bb3bb9ab80f868d549b5c5a5f3 2

3f9eba99fa788954b5ff7f35a5db6e1f 3

//url.idx end

It is a list of URL checksums with their corresponding docIDs and is sorted by

checksum. In order to find the docID of a particular URL, the URL's checksum

is computed and a binary search is performed on the checksums file to find its

docID.

./DocIndex

got Doc.idx, Url.idx, DocId2Url.idx //Data文�g夹中的Doc.idx DocId2Url.idx和Doc.idx�?/p>

//DocId2Url.idx

0 http://*.*.edu.cn/index.aspx

1 http://*.*.edu.cn/showcontent1.jsp?NewsID=118

2 http://*.*.edu.cn/0102.html

3 http://*.*.edu.cn/0103.html

//DocId2Url.idx end

2. sort Url.idx|uniq > Url.idx.sort_uniq //Data文�g夹中的Url.idx.sort_uniq

//Url.idx.sort_uniq

//对hash��D��行排�?/p>

000bfdfd8b2dedd926b58ba00d40986b 1111

000c7e34b653b5135a2361c6818e48dc 1831

0019d12f438eec910a06a606f570fde8 366

0033f7c005ec776f67f496cd8bc4ae0d 2103

3. Segment document to terms, (with finding document according to the url)

./DocSegment Tianwang.raw.2559638448 //Tianwang.raw.2559638448为爬回来的文�?�Q�每个页面包含http�?/p>

got Tianwang.raw.2559638448.seg

//Tianwang.raw.2559638448 爬取的原始网��|��件在文档内部每一个文档之间应该是通过version�Q?lt;/html>和回车做标志位分割的

version: 1.0

url: http://***.105.138.175/Default2.asp?lang=gb

origin: http://***.105.138.175/

date: Fri, 23 May 2008 20:01:36 GMT

ip: 162.105.138.175

length: 38413

HTTP/1.1 200 OK

Server: Microsoft-IIS/5.0

Date: Fri, 23 May 2008 11:17:49 GMT

Connection: keep-alive

Connection: Keep-Alive

Content-Length: 38088

Content-Type: text/html; Charset=gb2312

Expires: Fri, 23 May 2008 11:17:49 GMT

Set-Cookie: ASPSESSIONIDSSTRDCAB=IMEOMBIAIPDFCKPAEDJFHOIH; path=/

Cache-control: private

Apabi数字资源�q�_��

//Tianwang.raw.2559638448 end

//Tianwang.raw.2559638448.seg ��每个页面分成一行如�?注意中间没有回�R作�ؓ(f��)分隔)

...

//Tianwang.raw.2559638448.seg end

//下是 Tiny search 非必��d��?/p>

4. Create forward index (docid-->termid) //建立正向索引

./CrtForwardIdx Tianwang.raw.2559638448.seg > moon.fidx

//Tianwang.raw.2559638448.seg ��每个页面分成一行如�?/分词   DocID1三星/ s/ 手机/ 论坛/ ,/ 手机/ 铃声/ 下蝲/ ,/ 手机/ 囄��/ 下蝲/ ,/ 手机/2.........view plaincopy to clipboardprint?
//Tianwang.raw.2559638448.seg end

//moon.fidx

//每篇文档号对应文档内分出来的    分词 DocID

都会(x��) 2391

�?nbsp; 2391

那些 2391

拥有 2391

�?nbsp; 2391

�?nbsp; 2391

�?nbsp; 2391

�?nbsp; 2391

视野 2391

�?nbsp; 2391

�H?nbsp; 2391

�?nbsp; 2180

研究生部    2180

主页 2180

培养 2180

��理 2180

栏目 2180

下蝲 2180

�Q?nbsp; 2180

�?nbsp; 2180

关于 2180

做好 2180

�q?nbsp; 2180

国家 2180

公派 2180

研究�?2180

��目 2180

//moon.fidx end

5.# set | grep "LANG"

LANG=en; export LANG;

sort moon.fidx > moon.fidx.sort

6. Create inverted index (termid-->docid)    //建立倒排索引

    ./CrtInvertedIdx moon.fidx.sort > sun.iidx

//sun.iidx //文�g规模大概减少1/2

花工   236

花�v   2103

花卉   1018 1061 1061 1061 1730 1730 1730 1730 1730 1852 949 949

��p��   447 447

花木   1061

花呢   1430

花期   447 447 447 447 447 525

花钱   174 236

��p��   1730 1730

��p��品种     1660

��q��   450 526

花式   1428 1430 1430 1430

��q��   1430 1430

花序   447 447 447 447 447 450

��q�Q   136 137

��p��   450 450

//sun.iidx end

TSESearch   CGI program for query

Snapshot    CGI program for page snapshot

author:http://hi.baidu.com/jrckkyy

author:http://blog.csdn.net/jrckkyy

学者站在巨人的肩膀�?/a> 2009-12-10 22:55 发表评论

自顶向下学搜索引擎——北大天�|�搜索引擎TSE分析�?qi��ng)完全注释[4]��结

Thu, 10 Dec 2009 14:54:00 GMT

通过前面的三��文章相信你已经对神�U�的搜烦引擎有了一个感性的认识�Q�和普通的php�c�M��的脚本语�a�服务器类��|��通过获取前台关键字，通过字典分词�Q�和事先建立建立好的倒排索引�q�行相关性分析，得出查询�l�构格式化输出结果。而这里的技术难点在�?/p>

1、字典的选取�Q�事实上�Ҏ(gu��)��不同时代不同地方��Z��的语�a��?f��n)惯是不一��L(f��ng)��所以说字典的最��元的取值是不同的）

2、倒排索引的徏立（�q�里��p��涉及(qi��ng)到爬虫的抓取和烦引的建立后面��重点介�l�这2点，搜烦引擎的效率和服务质量实效性瓶颈在�q�里�Q?/p>

3、相��x��分析（�Ҏ(gu��)��回来的文档分词徏索引和用户关键字分词��法上要对应�Q?/p>

后面文章�?x��)重点介�l�爬虫的抓取和烦引的建立�?/p>

学者站在巨人的肩膀�?/a> 2009-12-10 22:54 发表评论

自顶向下学搜索引擎——北大天网搜索引擎TSE分析及完全注释[3]来到关键字分词及相关性分析程序

Thu, 10 Dec 2009 14:53:00 GMT

由前面的注释我们可以知道，查询关键字和字典文件准备好后，将进入用户关键字分词阶段。

//TSESearch.cpp中：(x��)

view plaincopy to clipboardprint?
CHzSeg iHzSeg;      //include ChSeg/HzSeg.h

//
iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery); //��get到的查询变量分词分成 "�?        �?      你们/ �?      格式"

vector vecTerm;
iQuery.ParseQuery(vecTerm);     //��以"/"划分开的关键字一一��序攑օ�一个向量容器中

set setRelevantRst;
iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);

gettimeofday(&end_tv,&tz);
// search end
//搜烦完毕

CHzSeg iHzSeg; //include ChSeg/HzSeg.h

//
iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery); //��get到的查询变量分词分成 "�? �? 你们/ �? 格式"

vector vecTerm;
iQuery.ParseQuery(vecTerm); //��以"/"划分开的关键字一一��序攑օ�一个向量容器中

set setRelevantRst;
iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);

gettimeofday(&end_tv,&tz);
// search end
//搜烦完毕view plaincopy to clipboardprint?
看CHzSeg 中的�q�个�Ҏ(gu��)��

看CHzSeg 中的�q�个�Ҏ(gu��)��view plaincopy to clipboardprint?
//ChSeg/HzSeg.h

//ChSeg/HzSeg.hview plaincopy to clipboardprint?
/**
 * Pre-processes a sentence for segmentation and returns the segmented text.
 *
 * The function repeatedly consumes a prefix of s1 and appends output to s2
 * until s1 is empty, then returns s2. In the visible text, a prefix is
 * appended to s2 followed by SEPARATOR, and the final part of the loop
 * passes a prefix to SegmentHzStrMM(dict, ...).
 *
 * NOTE(review): this listing is damaged. Text is missing after "while (i"
 * in two places (the remainder of those conditions, and apparently the rest
 * of the ASCII branch), and "ii=" / "s1s1=" look like duplicated
 * identifiers. It does not compile as shown; restore it from the real
 * HzSeg.cpp before relying on it.
 *
 * @param dict  dictionary handed to SegmentHzStrMM
 * @param s1    text to segment (taken by value and consumed)
 * @return the accumulated output string s2
 */
// process a sentence before segmentation
string CHzSeg::SegmentSentenceMM (CDict &dict, string s1) const
{
    string s2="";
    unsigned int i,len;

    while (!s1.empty())
    {
        unsigned char ch=(unsigned char) s1[0];
        if(ch<128)
        { // deal with ASCII
            i=1;
            len = s1.size();
            // NOTE(review): the text between "while (i" and "=161)" is missing.
            while (i=161)
              && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=162 && (unsigned char)s1[i+1]<=168)))
              && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=171 && (unsigned char)s1[i+1]<=191)))
              && (!((unsigned char)s1[i]==163 && ((unsigned char)s1[i+1]==172 || (unsigned char)s1[i+1]==161)
              || (unsigned char)s1[i+1]==168 || (unsigned char)s1[i+1]==169 || (unsigned char)s1[i+1]==186
              || (unsigned char)s1[i+1]==187 || (unsigned char)s1[i+1]==191)))
                {
                    ii=i+2; // assume there is no half (truncated) double-byte character
                }

                if (i==0) ii=i+2;

                // do not emit the Chinese full-width space (byte pair 161,161)
                if (!(ch==161 && (unsigned char)s1[1]==161))
                {
                    if (i <= s1.size()) // yhf
                        // other non-Hanzi double-byte characters may be output as one consecutive run
                        s2 += s1.substr(0, i) + SEPARATOR;
                    else break; // yhf
                }

                if (i <= s1.size()) // yhf
                    s1s1=s1.substr(i);
                else break;     //yhf

                continue;
            }
        }


    // the code below handles a run of Chinese characters

        i = 2;
        len = s1.length();

        // NOTE(review): the text between "while(i" and "=176)" is missing.
        while(i=176)
//    while(i=128 && (unsigned char)s1[i]!=161)
            i+=2;

        s2+=SegmentHzStrMM(dict, s1.substr(0,i));

        if (i <= len)    // yhf
            s1s1=s1.substr(i);
        else break; // yhf
    }

    return s2;
}

/**
 * Pre-processes a sentence for segmentation and returns the segmented text.
 *
 * The function repeatedly consumes a prefix of s1 and appends output to s2
 * until s1 is empty, then returns s2. In the visible text, the final part
 * of the loop passes a prefix to SegmentHzStrMM(dict, ...).
 *
 * NOTE(review): this listing is damaged. Text is missing after "while (i"
 * in two places, two statements have been fused into the comments that
 * precede them, and stray text follows the closing brace. It does not
 * compile as shown; restore it from the real HzSeg.cpp before relying on it.
 *
 * @param dict  dictionary handed to SegmentHzStrMM
 * @param s1    text to segment (taken by value and consumed)
 * @return the accumulated output string s2
 */
// process a sentence before segmentation
string CHzSeg::SegmentSentenceMM (CDict &dict, string s1) const
{
string s2="";
unsigned int i,len;

while (!s1.empty())
{
  unsigned char ch=(unsigned char) s1[0];
  if(ch<128)
  { // deal with ASCII
   i=1;
   len = s1.size();
   // NOTE(review): the text between "while (i" and "=161)" is missing.
   while (i=161)
              && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=162 && (unsigned char)s1[i+1]<=168)))
              && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=171 && (unsigned char)s1[i+1]<=191)))
              && (!((unsigned char)s1[i]==163 && ((unsigned char)s1[i+1]==172 || (unsigned char)s1[i+1]==161)
              || (unsigned char)s1[i+1]==168 || (unsigned char)s1[i+1]==169 || (unsigned char)s1[i+1]==186
              || (unsigned char)s1[i+1]==187 || (unsigned char)s1[i+1]==191)))
    {
     i=i+2; // assume there is no half (truncated) double-byte character
    }

if (i==0) i=i+2;

    // do not emit the Chinese full-width space. NOTE(review): a lost line break fused this guard into the comment: if (!(ch==161 && (unsigned char)s1[1]==161))
    {
     if (i <= s1.size()) // yhf
      // other non-Hanzi double-byte characters may be output as one run. NOTE(review): a lost line break fused this statement into the comment: s2 += s1.substr(0, i) + SEPARATOR;
     else break; // yhf
    }

    if (i <= s1.size()) // yhf
     s1=s1.substr(i);
    else break;  //yhf

    continue;
   }
  }

// the code below handles a run of Chinese characters

i = 2;
len = s1.length();

  // NOTE(review): the text between "while(i" and "=176)" is missing.
  while(i=176)
//    while(i=128 && (unsigned char)s1[i]!=161)
   i+=2;

s2+=SegmentHzStrMM(dict, s1.substr(0,i));

  if (i <= len) // yhf
   s1=s1.substr(i);
  else break; // yhf
}

return s2;
}view plaincopy to clipboardprint?

view plaincopy to clipboardprint?
//Query.cpp

//Query.cppview plaincopy to clipboardprint?

/**   
 * �E�序���译说明   
 * ���以"/"划分开的关键字一一��序攑օ�一个向量容器中   
 *   
 * @access  public   
 * @param   vector 参数的汉字说明：(x��)向量容器   
 * @return  void   
 */   
void CQuery::ParseQuery(vector &vecTerm)   
{   
    string::size_type idx;    
    while ( (idx = m_sSegQuery.find("/  ")) != string::npos ) {    
        vecTerm.push_back(m_sSegQuery.substr(0,idx));    
        m_sSegQuerym_sSegQuery = m_sSegQuery.substr(idx+3);    
    }   
}

/**   
 * �E�序���译说明   
 * 相关性分析查询，构造结果集合setRelevantRst //瓉���所�?nbsp;  
 *   
 * @access  public   
 * @param   vector map set 参数的汉字说明：(x��) 用户提交关键字的分词�l�，倒排索引映射�Q�相��x��结果集�?nbsp;  
 * @return  string 0   
 */   
bool CQuery::GetRelevantRst   
(   
    vector &vecTerm,    
    map &mapBuckets,    
    set &setRelevantRst   
) const   
{   
    set setSRst;   
  
    bool bFirst=true;   
    vector::iterator itTerm = vecTerm.begin();   
  
    for ( ; itTerm != vecTerm.end(); ++itTerm )   
    {   
  
        setSRst.clear();   
        copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));   
  
        map mapRstDoc;   
        string docid;   
        int doccnt;   
  
        map::iterator itBuckets = mapBuckets.find(*itTerm);   
        if (itBuckets != mapBuckets.end())   
        {   
            string strBucket = (*itBuckets).second;   
            string::size_type idx;   
            idx = strBucket.find_first_not_of(" ");   
            strBucketstrBucket = strBucket.substr(idx);   
  
            while ( (idx = strBucket.find(" ")) != string::npos )    
            {   
                docid = strBucket.substr(0,idx);   
                doccnt = 0;   
  
                if (docid.empty()) continue;   
  
                map::iterator it = mapRstDoc.find(docid);   
                if ( it != mapRstDoc.end() )   
                {   
                    doccnt = (*it).second + 1;   
                    mapRstDoc.erase(it);   
                }   
                mapRstDoc.insert( pair(docid,doccnt) );   
  
                strBucketstrBucket = strBucket.substr(idx+1);   
            }   
  
            // remember the last one   
            docid = strBucket;   
            doccnt = 0;   
            map::iterator it = mapRstDoc.find(docid);   
            if ( it != mapRstDoc.end() )   
            {   
                doccnt = (*it).second + 1;   
                mapRstDoc.erase(it);   
            }   
            mapRstDoc.insert( pair(docid,doccnt) );   
        }   
  
        // sort by term frequencty   
        multimap > newRstDoc;   
        map::iterator it0 = mapRstDoc.begin();   
        for ( ; it0 != mapRstDoc.end(); ++it0 ){   
            newRstDoc.insert( pair((*it0).second,(*it0).first) );   
        }   
  
        multimap::iterator itNewRstDoc = newRstDoc.begin();   
        setRelevantRst.clear();   
        for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){   
            string docid = (*itNewRstDoc).second;   
  
            if (bFirst==true) {   
                setRelevantRst.insert(docid);   
                continue;   
            }   
  
            if ( setSRst.find(docid) != setSRst.end() ){       
                setRelevantRst.insert(docid);   
            }   
        }   
  
        //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "
";   
        bFirst = false;   
    }   
    return true;   
}

接下来的��是现实了，前面都只是处理数据得�?setRelevantRst �q�个查询�l�构集合,�q�里��׃��多说了下面就和php之类的脚本语�a�差不多，格式化结果集合�ƈ昄��出来�?nbsp;

view plaincopy to clipboardprint?/**   * �E�序��译说明   * ��以"/"划分开的关键字一一��序攑օ�一个向量容器中   *   * @access public   * @param   vector 参数的汉字说明：(x��)向量容器   * @return void   */ void CQuery::ParseQuery(vector &vecTerm)   {       string::size_type idx;        while ( (idx = m_sSegQuery.find("/ ")) != string::npos ) {            vecTerm.push_back(m_sSegQuery.substr(0,idx));            m_sSegQuery = m_sSegQuery.substr(idx+3);        }   } /**
* �E�序��译说明
* ��以"/"划分开的关键字一一��序攑օ�一个向量容器中
*
* @access public
* @param   vector 参数的汉字说明：(x��)向量容器
* @return void
*/
void CQuery::ParseQuery(vector &vecTerm)
{
string::size_type idx;
while ( (idx = m_sSegQuery.find("/ ")) != string::npos ) {
  vecTerm.push_back(m_sSegQuery.substr(0,idx));
  m_sSegQuery = m_sSegQuery.substr(idx+3);
}
}

view plaincopy to clipboardprint?
view plaincopy to clipboardprint?

/**   * �E�序���译说明   * 相关性分析查询，构造结果集合setRelevantRst //瓉���所�?nbsp;  *   * @access  public   * @param   vector map set 参数的汉字说明：(x��) 用户提交关键字的分词�l�，倒排索引映射�Q�相��x��结果集�?nbsp;  * @return  string 0   */  bool CQuery::GetRelevantRst   (       vector &vecTerm,        map &mapBuckets,        set &setRelevantRst   ) const  {       set setSRst;         bool bFirst=true;       vector::iterator itTerm = vecTerm.begin();         for ( ; itTerm != vecTerm.end(); ++itTerm )       {             setSRst.clear();           copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));             map mapRstDoc;           string docid;           int doccnt;             map::iterator itBuckets = mapBuckets.find(*itTerm);           if (itBuckets != mapBuckets.end())           {               string strBucket = (*itBuckets).second;               string::size_type idx;               idx = strBucket.find_first_not_of(" ");               strBucket = strBucket.substr(idx);                 while ( (idx = strBucket.find(" ")) != string::npos )                {                   docid = strBucket.substr(0,idx);                   doccnt = 0;                     if (docid.empty()) continue;                     map::iterator it = mapRstDoc.find(docid);                   if ( it != mapRstDoc.end() )                   {                       doccnt = (*it).second + 1;                       mapRstDoc.erase(it);                   }                   mapRstDoc.insert( pair(docid,doccnt) );                     strBucket = strBucket.substr(idx+1);               }                 // remember the last one               docid = strBucket;               doccnt = 0;               map::iterator it = mapRstDoc.find(docid);               if ( it != mapRstDoc.end() )               {                   doccnt = (*it).second + 1;                   mapRstDoc.erase(it);               }               mapRstDoc.insert( pair(docid,doccnt) );           }             // sort by term frequencty      
     multimap > newRstDoc;           map::iterator it0 = mapRstDoc.begin();           for ( ; it0 != mapRstDoc.end(); ++it0 ){               newRstDoc.insert( pair((*it0).second,(*it0).first) );           }             multimap::iterator itNewRstDoc = newRstDoc.begin();           setRelevantRst.clear();           for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){               string docid = (*itNewRstDoc).second;                 if (bFirst==true) {                   setRelevantRst.insert(docid);                   continue;               }                 if ( setSRst.find(docid) != setSRst.end() ){                       setRelevantRst.insert(docid);               }           }             //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "
";           bFirst = false;       }       return true;   }

view plaincopy to clipboardprint?/**   * �E�序��译说明   * 相关性分析查询，构造结果集合setRelevantRst //瓉��所�?nbsp; *   * @access public   * @param   vector map set 参数的汉字说明：(x��) 用户提交关键字的分词�l�，倒排索引映射�Q�相��x��结果集�?nbsp; * @return string 0   */ bool CQuery::GetRelevantRst   (       vector &vecTerm,        map &mapBuckets,        set &setRelevantRst   ) const {       set setSRst;         bool bFirst=true;       vector::iterator itTerm = vecTerm.begin();         for ( ; itTerm != vecTerm.end(); ++itTerm )       {             setSRst.clear();           copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));             map mapRstDoc;           string docid;           int doccnt;             map::iterator itBuckets = mapBuckets.find(*itTerm);           if (itBuckets != mapBuckets.end())           {               string strBucket = (*itBuckets).second;               string::size_type idx;               idx = strBucket.find_first_not_of(" ");               strBucket = strBucket.substr(idx);                 while ( (idx = strBucket.find(" ")) != string::npos )                {                   docid = strBucket.substr(0,idx);                   doccnt = 0;                     if (docid.empty()) continue;                     map::iterator it = mapRstDoc.find(docid);                   if ( it != mapRstDoc.end() )                   {                       doccnt = (*it).second + 1;                       mapRstDoc.erase(it);                   }                   mapRstDoc.insert( pair(docid,doccnt) );                     strBucket = strBucket.substr(idx+1);               }                 // remember the last one               docid = strBucket;               doccnt = 0;               map::iterator it = mapRstDoc.find(docid);               if ( it != mapRstDoc.end() )               {                   doccnt = (*it).second + 1;                   mapRstDoc.erase(it);               }               mapRstDoc.insert( pair(docid,doccnt) );           }             // 
sort by term frequencty           multimap > newRstDoc;           map::iterator it0 = mapRstDoc.begin();           for ( ; it0 != mapRstDoc.end(); ++it0 ){               newRstDoc.insert( pair((*it0).second,(*it0).first) );           }             multimap::iterator itNewRstDoc = newRstDoc.begin();           setRelevantRst.clear();           for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){               string docid = (*itNewRstDoc).second;                 if (bFirst==true) {                   setRelevantRst.insert(docid);                   continue;               }                 if ( setSRst.find(docid) != setSRst.end() ){                       setRelevantRst.insert(docid);               }           }             //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "
";           bFirst = false;       }       return true;   } /**
* �E�序��译说明
* 相关性分析查询，构造结果集合setRelevantRst //瓉��所�?br> *
* @access public
* @param   vector map set 参数的汉字说明：(x��) 用户提交关键字的分词�l�，倒排索引映射�Q�相��x��结果集�?br> * @return string 0
*/
bool CQuery::GetRelevantRst
(
vector &vecTerm,
map &mapBuckets,
set &setRelevantRst
) const
{
set setSRst;

bool bFirst=true;
vector::iterator itTerm = vecTerm.begin();

for ( ; itTerm != vecTerm.end(); ++itTerm )
{

setSRst.clear();
copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));

  map mapRstDoc;
  string docid;
  int doccnt;

  map::iterator itBuckets = mapBuckets.find(*itTerm);
  if (itBuckets != mapBuckets.end())
  {
   string strBucket = (*itBuckets).second;
   string::size_type idx;
   idx = strBucket.find_first_not_of(" ");
   strBucket = strBucket.substr(idx);

   while ( (idx = strBucket.find(" ")) != string::npos )
   {
    docid = strBucket.substr(0,idx);
    doccnt = 0;

if (docid.empty()) continue;

    map::iterator it = mapRstDoc.find(docid);
    if ( it != mapRstDoc.end() )
    {
     doccnt = (*it).second + 1;
     mapRstDoc.erase(it);
    }
    mapRstDoc.insert( pair(docid,doccnt) );

strBucket = strBucket.substr(idx+1);
}

   // remember the last one
   docid = strBucket;
   doccnt = 0;
   map::iterator it = mapRstDoc.find(docid);
   if ( it != mapRstDoc.end() )
   {
    doccnt = (*it).second + 1;
    mapRstDoc.erase(it);
   }
   mapRstDoc.insert( pair(docid,doccnt) );
  }

  // sort by term frequencty
  multimap > newRstDoc;
  map::iterator it0 = mapRstDoc.begin();
  for ( ; it0 != mapRstDoc.end(); ++it0 ){
   newRstDoc.insert( pair((*it0).second,(*it0).first) );
  }

  multimap::iterator itNewRstDoc = newRstDoc.begin();
  setRelevantRst.clear();
  for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){
   string docid = (*itNewRstDoc).second;

   if (bFirst==true) {
    setRelevantRst.insert(docid);
    continue;
   }

   if ( setSRst.find(docid) != setSRst.end() ){
    setRelevantRst.insert(docid);
   }
  }

//cout << "setRelevantRst.size(): " << setRelevantRst.size() << "";
bFirst = false;
}
return true;
}

接下来的就是显示了，前面都只是处理数据得到 setRelevantRst 这个查询结构集合，这里就不多说了，下面就和php之类的脚本语言差不多，格式化结果集合并显示出来。
//TSESearch.cpp

view plaincopy to clipboardprint?
// Display phase: page header first, then the summary line, then the hit list.
    CDisplayRst iDisplayRst;
    iDisplayRst.ShowTop();

    // Elapsed search time in milliseconds: whole seconds scaled up by 1000
    // plus the microsecond difference scaled down by 1000.
    const float usec_as_msec =
        ((float)(end_tv.tv_usec-begin_tv.tv_usec))/(float)1000;
    float used_msec = (end_tv.tv_sec-begin_tv.tv_sec)*1000 + usec_as_msec;

    iDisplayRst.ShowMiddle(iQuery.m_sQuery,
                           used_msec,
                           setRelevantRst.size(),
                           iQuery.m_iStart);

    iDisplayRst.ShowBelow(vecTerm,
                          setRelevantRst,
                          vecDocIdx,
                          iQuery.m_iStart);

学者站在巨人的肩膀�?/a> 2009-12-10 22:53 发表评论

自顶向下学搜索引擎——北大天网搜索引擎TSE分析及完全注释[2]路过查询处理程序

Thu, 10 Dec 2009 14:52:00 GMT

由上一篇文章[原]自顶向下学搜索引擎——北大天网搜索引擎TSE分析及完全注释[1]寻找搜索引擎入口，我们可以知道整个程序是从TSESearch.cpp 中的main函数开始的，我们重点看一下这段代码。

//TSESearch.cpp CQuery iQuery;
iQuery.GetInputs(); //具体�E�序开始执�?br> // current query & result page number
iQuery.SetQuery();
iQuery.SetStart();

// begin to search
//开始具体搜索程�?br> gettimeofday(&begin_tv,&tz); //开始计时获取程序运行时间差

iQuery.GetInvLists(mapBuckets);  //��所有字�W�集存入映射变量�?nbsp;瓉��所�?br> iQuery.GetDocIdx(vecDocIdx);  //��倒排索引存入向量�?nbsp; 瓉��所�?br>
CHzSeg iHzSeg;  //include ChSeg/HzSeg.h
iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery); //��get到的查询变量分词分成 "�?  �?  你们/ �?  格式"

vector vecTerm;
iQuery.ParseQuery(vecTerm);  //��以"/"划分开的关键字一一��序攑օ�一个向量容器中

set setRelevantRst;
iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);

gettimeofday(&end_tv,&tz);
// search end
//搜烦完毕按照��序我们首先深入�q�iQuery对象的类CQuery

//Query.cpp

1、GetInputs

这个方法的功能是将前台get过来的变量转换到HtmlInputs结构体数组中，如下例子和代码：

//假设前台查询的关键字�?1"着HtmlInputs中内容输出如�?nbsp; //HtmlInputs[0].Name word //HtmlInputs[0].Value 1 //HtmlInputs[1].Name www //HtmlInputs[1].Value 搜烦 //HtmlInputs[2].Name cdtype //HtmlInputs[2].Value GB

/*
* Get form information throught environment varible.
* return 0 if succeed, otherwise exit.
*/
/**
* �E�序��译说明
* 处理GET�q�来的表�?br> *
* @access public
* @return string 0
*/
int CQuery::GetInputs()
{
    int i,j;
char *mode = getenv("REQUEST_METHOD"); //�q�回环境变量的�?�q�里环境变量 REQUEST_METHOD �?get �Ҏ(gu��)��
    char *tempstr; //GET变量字符串或POST字符串内�?br> char *in_line;
int length;  //GET变量串长度或POST内容长度

cout << "Content-type: text/html\n\n";
//cout << "Cache-Control: no-cache\n";
//cout << "Expires: Tue, 08 Apr 1997 17:20:00 GMT\n";
//cout << "Expires: 0\n";
//cout << "Pragma: no-cache\n\n";

cout << "\n";
cout << "\n";
//cout << "\n";
//cout << "\n";
//cout << "\n";
cout << "\n";
cout.flush(); //释放输出�~�冲�?输出头部head和之前的html标签内容
//cout <<"" << endl;

if (mode==NULL) return 1;

if (strcmp(mode, "POST") == 0)
{
  length = atoi(getenv("CONTENT_LENGTH")); //如果是POST�Ҏ(gu��)��着获得环境变量CONTENT_LENGTH的整型�?br>  if (length==0 || length>=256)
   return 1;
  in_line = (char*)malloc(length + 1);
  read(STDIN_FILENO, in_line, length);
  in_line[length]='\0';
}
else if (strcmp(mode, "GET") == 0)
{
  char* inputstr = getenv("QUERY_STRING"); //如果是GET�Ҏ(gu��)��着获得环境变量QUERY_STRING的字�W�串�?br>  length = strlen(inputstr);
  if (inputstr==0 || length>=256)
   return 1;

  //获取get内容长度�q�把get �Q�后面的参数赋值给变量in_line
  in_line = (char*)malloc(length + 1);
  strcpy(in_line, inputstr); //��心溢出��d��
}

tempstr = (char*)malloc(length + 1); //获取post内容或get内容长度
if(tempstr == NULL)
{
  printf("\n");
  printf("\n");
  printf("Major failure #1;please notify the webmaster\n");
  printf("\n");
  fflush(stdout); //输出�~�冲�?br>  exit(2); //错误�q�回
}

j=0;
for (i=0; i char
   strcpy(HtmlInputs[HtmlInputCount].Name,tempstr);
   if (i == length - 1)
   {
    strcpy(HtmlInputs[HtmlInputCount].Value,"");
    HtmlInputCount++;
   }
   j=0;
  }
  else if ((in_line[i] == '&') || (i==length-1))
  {
   if (i==length-1)
   {
    if(in_line[i] == '+')tempstr[j]=' ';
    else tempstr[j] = in_line[i];
    j++;
   }
   tempstr[j]='\0';
   CStrFun::Translate(tempstr); //��URL�~�码形式的参数�{换成字符�?%** -> char
   strcpy(HtmlInputs[HtmlInputCount].Value,tempstr);
   HtmlInputCount++;
   j=0;
  }
  else if (in_line[i] == '+')
  {
   tempstr[j]=' ';
   j++;
  }
  else
  {
   tempstr[j]=in_line[i]; //�l�合get中的变量如word www cdtype
   j++;
  }
  //cout<";
  //cout<";
  //cout.flush();
}

/*
for (int kk = 0; kk < HtmlInputCount ; ++kk )
{
cout<<"Name="<";
cout<<"Value="<";
}
//假设前台查询的关键字�?1"输出如下
//Name=word
//Value=1
//Name=www
//Value= 搜烦
//Name=cdtype
//Value=GB
*/

if(in_line) free(in_line);
if(tempstr) free(tempstr);

return 0;
}

2、SetQuery

//Query.cpp
void CQuery::SetQuery()
{
string q = HtmlInputs[0].Value;
CStrFun::Str2Lower(q,q.size()); //大写变小�?br> m_sQuery = q; //准备查询关键�?br>}
3、SetStart
void CQuery::SetQuery()
{
string q = HtmlInputs[0].Value;
CStrFun::Str2Lower(q,q.size()); //大写变小写word变量里的�?br> m_sQuery = q; //讄��查询关键�?br>}

4、GetInvLists
bool CQuery::GetInvLists(map &mapBuckets) const
{
ifstream ifsInvInfo(INF_INFO_NAME.c_str(), ios::binary); //以二�q�制形式打开一个文件的输入��缓�Ԍ��INF_INFO_NAME在头文�gComm.h中定义了的， const string INF_INFO_NAME("./Data/sun.iidx");
//倒排索引文�g索引字和文档好之间有一个制表符"\t"
//朱�d 14383 16151 16151 16151 1683 207 6302 7889 8218 8218 8637
//朱古�?nbsp; 1085 1222

if (!ifsInvInfo) {
cerr << "Cannot open " << INF_INFO_NAME << " for input\n";
return false;
}
string strLine, strWord, strDocNum;
//以行��d��输入��缓冲到字符串对象strLine中�ƈ做处�?br> while (getline(ifsInvInfo, strLine)) {
string::size_type idx;
string tmp;
idx = strLine.find("\t");
strWord = strLine.substr(0,idx);
strDocNum = strLine.substr(idx+1);
mapBuckets.insert(map::value_type (strWord, strDocNum)); //倒排表二��二�l�表存入映射�?br>
/*
map::iterator iter;
int kkk = 0;
for (iter = mapBuckets.begin(); kkk != 10; ++iter)
{
   cout<first<<" "<second<<"
";
   ++kkk;
}
cout.flush();
*/
}
return true;
}

5、GetDocIdx

bool CQuery::GetDocIdx(vector &vecDocIdx) const
{
ifstream ifs(DOC_IDX_NAME.c_str(), ios::binary);
//0  0  bc9ce846d7987c4534f53d423380ba70
//1  76760 4f47a3cad91f7d35f4bb6b2a638420e5
//2  141624 d019433008538f65329ae8e39b86026c

if (!ifs) {
cerr << "Cannot open " << DOC_IDX_NAME << " for input\n"; //以二�q�制形式打开一个文件的输入��缓�Ԍ��DOC_IDX_NAME在头文�gComm.h中定义了的， const string INF_INFO_NAME("./Data/Doc.idx");
return false;
}

string strLine, strDocid, strUrl;
while (getline(ifs,strLine)){
DocIdx di;

sscanf( strLine.c_str(), "%d%d", &di.docid, &di.offset ); //只保留了前面两项文档号和偏移�?br> vecDocIdx.push_back(di); //导入�l�构体向量中
}

return true;
}

学者站在巨人的肩膀�?/a> 2009-12-10 22:52 发表评论

自顶向下学搜索引擎——北大天网搜索引擎TSE分析及完全注释[1]寻找搜索引擎入口

Thu, 10 Dec 2009 14:51:00 GMT

由于百度博客http://hi.baidu.com/jrckkyy发表文章字数有限，以后原创文章全部都先发表到csdn和cu上，再发表到百度博客上，百度博客除了放原创的文章还主要放网上寻找到的优秀文章。

本着黑客精神，我将陆续把最近分析注释TSE搜索引擎的心得发布出来，老鸟、大虾、大牛、高手飘过就是了，若愿意免费指点下小弟，在下不胜感激，有问题的朋友直接留言讨论。由于本人水平有限，分析和翻译难免有错，让大家见笑了。

上学期拜��M��James F.Kurose著的《计��机�|�络-自顶向下�Ҏ(gu��)��与internet特色(�W�三版阴�?》，觉得写得��实不错(希望没看的朋友一定要买来看看)�Q�自�׃��来搞个高自顶向下的学�?f��n)方法，先从用户看得到的东西出发分析研究搜烦引擎�Q�下面我们就来看看各大搜索引擎搜索界面的代码�Q�你所需要特别注意的是form表单中的action

雅虎http://www.yahoo.com/：

��h��http://www.g.cn�Q?/p>

癑ֺ�http://www.baidu.com�Q?/p>

' + rescode[i][0] + ''); } else ...{ document.write(''); } } document.f.range.selectedIndex = 0; --> -

��试服务器TSE�Q?/p>

由以上几个form的属性可以看出全部采用的是get方法，CGI作为处理程序，也就是C/C++。CGI全称是“公共网关界面”(Common Gateway Interface)，是HTTP服务器与你的或其它机器上的程序进行“交谈”的一种工具，其程序须运行在网络服务器上。CGI逐渐被近几年来的PHP、JAVA、ASP、PERL、Python、Ruby等动态语言所取代，但是其在速度和运行效率上的优势是无法取代的。

以下是TSE CGI入口程序注释，其他搜索引擎的入口也应该类似：

/**//**
* �E�序��译说明
* @Copyright (c) 2008, 研发�?br> * All rights reserved.
*
* @filesource TSESearch.cpp
* @author jrckkyy <jrckkyy@163.com>
*
* Let's start
*
*/
#include
#include
#include
#include
#include
#include
#include
#include

#include
#include
#include

#include "Comm.h"    //包含2个烦引和1个数据文�?br>#include "Query.h"    //包含数据查询处理头文�?br>#include "Document.h"    //html文档处理头文�?br>#include "StrFun.h"        //字符串处理头文�g
#include "ChSeg/Dict.h"    //字元字典处理头文�?br>#include "ChSeg/HzSeg.h"
#include "DisplayRst.h"    //�q�回查询�l�果��面头文�Ӟ��q�回�l�果分�ؓ(f��)头部�Q�中部，底部

using namespace std;

/**//*
* An inverted file (INF) includes a term-index file & an inverted-lists file.
* An inverted list consists of many buckets (posting lists).
* The term-index file is stored in vecTerm, and
* the inverted lists are stored in mapBuckets.
*/

/**//**
* �E�序��译说明
* 搜烦�E�序入口前台关键字提交到该cgi�E�序例如�Q?/cgi-bin/index/TSESearch?word=123&start=1
* 倒排文�g包括一个记录检索词文�g和一个倒排列表文�g�?br> * 倒排列表包含很多标志�Q�提交名单）�?br> * 记录��索词文�g使用vecTerm来排序，和倒排列表是用mapBuckets来排序�?br> *
* @access public
* @param   int char 参数的汉字说�?用于接收前台get传递的参数
* @return string 0
*/
int main(int argc, char* argv[])
...{
    struct timeval begin_tv, end_tv;
    struct timezone tz;

    CDict iDict;
    map dictMap, mapBuckets;
    vector vecDocIdx;    //Document。h

    CQuery iQuery;
    iQuery.GetInputs();        //具体�E�序开始执�?br>    // current query & result page number
    iQuery.SetQuery();
    iQuery.SetStart();

// begin to search
//开始具体搜索程�?br> gettimeofday(&begin_tv,&tz); //开始计时获取程序运行时间差

    iQuery.GetInvLists(mapBuckets);        //��所有字�W�集存入映射变量�?nbsp;   瓉��所�?br>    iQuery.GetDocIdx(vecDocIdx);        //��倒排索引存入向量�?nbsp;       瓉��所�?br>
    CHzSeg iHzSeg;        //include ChSeg/HzSeg.h
    iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery);    //��get到的查询变量分词分成 "�?        �?        你们/    �?        格式"

    vector vecTerm;
    iQuery.ParseQuery(vecTerm);        //��以"/"划分开的关键字一一��序攑օ�一个向量容器中

    set setRelevantRst;
    iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);

    gettimeofday(&end_tv,&tz);
    // search end
    //搜烦完毕

//下面开始显�C?br> CDisplayRst iDisplayRst;
iDisplayRst.ShowTop();

float used_msec = (end_tv.tv_sec-begin_tv.tv_sec)*1000
+((float)(end_tv.tv_usec-begin_tv.tv_usec))/(float)1000;

iDisplayRst.ShowMiddle(iQuery.m_sQuery,used_msec,
setRelevantRst.size(), iQuery.m_iStart);

iDisplayRst.ShowBelow(vecTerm,setRelevantRst,vecDocIdx,iQuery.m_iStart);

return 0;

}

学者站在巨人的肩膀�?/a> 2009-12-10 22:51 发表评论

久久热这里只有精品在线观看,一个色综合久久,久久久久亚洲AV成人网人人网站

自顶向下学搜索引擎——北大天�|�搜索引擎TSE分析�?qi��ng)完全注释[6]倒排索引的徏立的�E�序分析(4)

自顶向下学搜索引擎——北大天�|�搜索引擎TSE分析�?qi��ng)完全注释[6]倒排索引的徏立的�E�序分析(2)

自顶向下学搜索引擎——北大天�|�搜索引擎TSE分析�?qi��ng)完全注释[6]倒排索引的徏立的�E�序分析(3)

自顶向下学搜索引擎——北大天�|�搜索引擎TSE分析�?qi��ng)完全注释[6]倒排索引的徏立的�E�序分析(1)

自顶向下学搜索引擎——北大天�|�搜索引擎TSE分析�?qi��ng)完全注释[5]倒排索引的徏立及(qi��ng)文�g介绍

自顶向下学搜索引擎——北大天�|�搜索引擎TSE分析�?qi��ng)完全注释[4]���结

自顶向下学搜索引擎——北大天�|�搜索引擎TSE分析�?qi��ng)完全注释[3]来到关键字分词及(qi��ng)相关性分析程�?

自顶向下学搜索引擎——北大天�|�搜索引擎TSE分析�?qi��ng)完全注释[2]路过查询处理�E�序

自顶向下学搜索引擎——北大天�|�搜索引擎TSE分析�?qi��ng)完全注释[1]��L��搜烦引擎入口

自顶向下学搜索引擎——北大天�|�搜索引擎TSE分析�?qi��ng)完全注释[4]��结