Learning Search Engines Top-Down, Analysis and Complete Annotation of the PKU Tianwang Search Engine TSE [6]: Building the Inverted Index, Program Analysis (4)
http://www.shnenglu.com/jrckkyy/archive/2009/12/10/102949.html

Below is the annotated program that builds the inverted index from the sorted forward index.

 

int main(int argc, char* argv[])    //./CrtInvertedIdx moon.fidx.sort > sun.iidx
{
    ifstream ifsImgInfo(argv[1]);
    if (!ifsImgInfo)
    {
        cerr << "Cannot open " << argv[1] << " for input\n";
        return -1;
    }

    string strLine, strDocNum, tmp1 = "";
    int cnt = 0;
    while (getline(ifsImgInfo, strLine))
    {
        string::size_type idx;
        string tmp;

        // each input line is "term \t docID"; the file is sorted by term
        idx = strLine.find("\t");
        tmp = strLine.substr(0, idx);

        // skip terms that are implausibly short or long
        if (tmp.size() < 2 || tmp.size() > 8) continue;

        if (tmp1.empty()) tmp1 = tmp;

        if (tmp == tmp1)
        {
            // same term as the previous line: append this docID to the posting list
            strDocNum = strDocNum + " " + strLine.substr(idx+1);
        }
        else
        {
            if ( strDocNum.empty() )
                strDocNum = strDocNum + " " + strLine.substr(idx+1);

            // a new term starts: flush the finished posting list, then begin a new one
            cout << tmp1 << "\t" << strDocNum << endl;
            tmp1 = tmp;
            strDocNum.clear();
            strDocNum = strDocNum + " " + strLine.substr(idx+1);
        }

        cnt++;
        //if (cnt==100) break;
    }
    cout << tmp1 << "\t" << strDocNum << endl;  // in the inverted index, each dictionary term is followed by its document IDs, separated by a tab

    return 0;
}
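As a tiny illustration of the transformation (the terms and docIDs here are made up): if moon.fidx.sort contains the sorted term/docID pairs

手机	1
手机	3
铃声	2

then the loop above emits one posting list per term to sun.iidx:

手机	 1 3
铃声	 2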

 

 



Learning Search Engines Top-Down, Analysis and Complete Annotation of the PKU Tianwang Search Engine TSE [6]: Building the Inverted Index, Program Analysis (2)
http://www.shnenglu.com/jrckkyy/archive/2009/12/10/102947.html

The DocIndex program described earlier takes one Tianwang.raw.***** file as input and produces three files: Doc.idx, Url.idx and DocId2Url.idx. Here we analyze the DocSegment program.

DocSegment takes three inputs, Tianwang.raw.*****, Doc.idx and Url.idx.sort_uniq, and outputs one file, Tianwang.raw.***.seg, in which the segmentation (word splitting) is already done.

int main(int argc, char* argv[])
{
    string strLine, strFileName = argv[1];
    CUrl iUrl;
    vector<CUrl> vecCUrl;
    CDocument iDocument;
    vector<CDocument> vecCDocument;
    unsigned int docId = 0;

    //ifstream ifs("Tianwang.raw.2559638448");
    ifstream ifs(strFileName.c_str());  // usage: DocSegment Tianwang.raw.****
    if (!ifs)
    {
        cerr << "Cannot open " << strFileName << " for input\n";
        return -1;
    }

    ifstream ifsUrl("Url.idx.sort_uniq");   // the sorted, de-duplicated url dictionary
    if (!ifsUrl)
    {
        cerr << "Cannot open Url.idx.sort_uniq for input\n";
        return -1;
    }
    ifstream ifsDoc("Doc.idx"); // the document index file
    if (!ifsDoc)
    {
        cerr << "Cannot open Doc.idx for input\n";
        return -1;
    }

    while (getline(ifsUrl, strLine)) // walk the url dictionary and load it into a vector in memory
    {
        char chksum[33];
        int  docid;

        memset(chksum, 0, 33);
        sscanf( strLine.c_str(), "%s%d", chksum, &docid );
        iUrl.m_sChecksum = chksum;
        iUrl.m_nDocId = docid;
        vecCUrl.push_back(iUrl);
    }

    while (getline(ifsDoc, strLine))     // walk the document index and load it into a vector in memory
    {
        int docid, pos, length;
        char chksum[33];

        memset(chksum, 0, 33);
        sscanf( strLine.c_str(), "%d%d%d%s", &docid, &pos, &length, chksum );
        iDocument.m_nDocId = docid;
        iDocument.m_nPos = pos;
        iDocument.m_nLength = length;
        iDocument.m_sChecksum = chksum;
        vecCDocument.push_back(iDocument);
    }

    strFileName += ".seg";
    ofstream fout(strFileName.c_str(), ios::in|ios::out|ios::trunc|ios::binary);    // output file for the segmented data
    for ( docId=0; docId<MAX_DOC_ID; docId++ )
    {
        // find document according to docId
        int length = vecCDocument[docId+1].m_nPos - vecCDocument[docId].m_nPos - 1;
        char *pContent = new char[length+1];
        memset(pContent, 0, length+1);
        ifs.seekg(vecCDocument[docId].m_nPos);
        ifs.read(pContent, length);

        char *s;
        s = pContent;

        // skip the Tianwang record head (terminated by a blank line)
        int bytesRead = 0, newlines = 0;
        while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1)
        {
            if (*s == '\n')
                newlines++;
            else
                newlines = 0;
            s++;
            bytesRead++;
        }
        if (bytesRead == HEADER_BUF_SIZE-1) continue;

        // skip the HTTP header (also terminated by a blank line)
        bytesRead = 0, newlines = 0;
        while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1)
        {
            if (*s == '\n')
                newlines++;
            else
                newlines = 0;
            s++;
            bytesRead++;
        }
        if (bytesRead == HEADER_BUF_SIZE-1) continue;

        //iDocument.m_sBody = s;
        iDocument.RemoveTags(s);    // strip the <...> markup
        iDocument.m_sBodyNoTags = s;

        delete[] pContent;
        string strLine = iDocument.m_sBodyNoTags;

        CStrFun::ReplaceStr(strLine, "&nbsp;", " "); // replace HTML non-breaking spaces (this literal was garbled in the blog copy)
        CStrFun::EmptyStr(strLine); // set " \t\r\n" to " "

        // the actual word segmentation
        CHzSeg iHzSeg;
        strLine = iHzSeg.SegmentSentenceMM(iDict, strLine);
        fout << docId << endl << strLine;
        fout << endl;
    }

    return(0);
}

This is only a quick, skim-level pass over the code; later I will devote dedicated posts to techniques such as parsing HTML and segmenting documents.
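The RemoveTags call above is where the markup is dropped before segmentation; its implementation is not shown in this post. Here is a minimal sketch of what such a tag stripper could look like (the function name and logic are assumptions, not the actual CDocument::RemoveTags):

// Hypothetical sketch of an in-place tag stripper, NOT the real CDocument::RemoveTags.
// It copies characters over the same buffer, skipping everything between '<' and '>'.
void RemoveTagsSketch(char *s)
{
    char *src = s, *dst = s;
    bool inTag = false;              // true while we are inside <...>
    for (; *src; ++src)
    {
        if (*src == '<')       inTag = true;
        else if (*src == '>')  inTag = false;
        else if (!inTag)       *dst++ = *src;   // keep visible text only
    }
    *dst = '\0';
}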

 

 



Learning Search Engines Top-Down, Analysis and Complete Annotation of the PKU Tianwang Search Engine TSE [6]: Building the Inverted Index, Program Analysis (3)
http://www.shnenglu.com/jrckkyy/archive/2009/12/10/102948.html

This post covers building the forward index. Building the inverted index directly would likely be very inefficient, so a forward index is produced first as the basis for the inverted index later on.

 

The functions and formats of all the files involved are described in detail in part [5], "Building the Inverted Index and the Files Involved".

 

CrtForwardIdx.cpp:

 

int main(int argc, char* argv[])    //./CrtForwardIdx Tianwang.raw.***.seg > moon.fidx
{
    ifstream ifsImgInfo(argv[1]);
    if (!ifsImgInfo)
    {
        cerr << "Cannot open " << argv[1] << " for input\n";
        return -1;
    }

    string strLine, strDocNum;
    int cnt = 0;
    while (getline(ifsImgInfo, strLine))
    {
        string::size_type idx;

        cnt++;
        if (cnt%2 == 1) // odd lines carry the document ID
        {
            strDocNum = strLine.substr(0, strLine.size());
            continue;
        }
        if (strLine[0]=='\0' || strLine[0]=='#' || strLine[0]=='\n')
        {
            continue;
        }

        while ( (idx = strLine.find(SEPARATOR)) != string::npos ) // look for the segmentation separator
        {
            string tmp1 = strLine.substr(0,idx);
            cout << tmp1 << "\t" << strDocNum << endl;   // emit one "term \t docID" pair per term occurrence
            strLine = strLine.substr(idx + SEPARATOR.size());
        }

        //if (cnt==100) break;
    }

    return 0;
}
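A tiny made-up example of the transformation: if Tianwang.raw.***.seg contains

1
三星/  手机/  铃声/

then CrtForwardIdx prints one term/docID pair per line to moon.fidx:

三星	1
手机	1
铃声	1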

 

Learning Search Engines Top-Down, Analysis and Complete Annotation of the PKU Tianwang Search Engine TSE [6]: Building the Inverted Index, Program Analysis (1)
http://www.shnenglu.com/jrckkyy/archive/2009/12/10/102945.html


The previous post mainly introduced the files and intermediate files involved in building the inverted index.
At the program level, TSE's index build can be simplified into the following steps:

1. Run ./DocIndex
   Uses one file, e.g. tianwang.raw.520: the raw crawled file holding all the information of many pages, so it is large. How to store it is still an open question: a single big file (beyond 2 GB or 4 GB limits, and indexing a huge file is slow) or many small files (too much overhead opening and closing file handles). Ultimately the storage will have to be distributed, since the total volume will reach TB scale; TSE only targets small search engine needs.
   Produces three files: Doc.idx, Url.idx, DocId2Url.idx (the Doc.idx, DocId2Url.idx and Url.idx in the Data folder).

2. Run sort Url.idx|uniq > Url.idx.sort_uniq (the Url.idx.sort_uniq in the Data folder)
   Uses one file, Url.idx: pairs of the MD5-hashed full URL and the document id.
   Produces one file, Url.idx.sort_uniq: URLs de-duplicated and sorted by MD5 hash, which speeds up lookup.

3. Run ./DocSegment Tianwang.raw.2559638448
   Uses one file, Tianwang.raw.2559638448: the crawled file, in which every page includes its HTTP headers. Segmenting it prepares for building the inverted index later.
   Produces one file, Tianwang.raw.2559638448.seg: the segmented file, consisting of one line with the document id followed by one line with the segmented document (only the text inside markers such as <html></html>, <head></head> and <body></body> is segmented).

4. Run ./CrtForwardIdx Tianwang.raw.2559638448.seg > moon.fidx to build the standalone forward index.

5. Run:
   # set | grep "LANG"
   # LANG=en; export LANG;
   # sort moon.fidx > moon.fidx.sort

6. Run ./CrtInvertedIdx moon.fidx.sort > sun.iidx to build the inverted index.

We start the analysis with the first indexing program, DocIndex.cpp. (Convention for the annotations: Tianwang.raw.2559638448 is the large file merged from the crawl, called "the big file" below; it contains many HTML documents, separated in a regular way, each of which is called "a document".)


//DocIndex.h start-------------------------------------------------------------

 


#ifndef _COMM_H_040708_
#define _COMM_H_040708_

// (the standard header names were stripped in the blog copy; these are the ones this code needs)
#include <stdio.h>

#include <string>
#include <iostream>
#include <fstream>
#include <vector>
#include <map>
#include <set>
#include <algorithm>


using namespace std;

const unsigned HEADER_BUF_SIZE = 1024;
const unsigned RstPerPage = 20; // number of results returned per page to the front end

//iceway
//const unsigned MAX_DOC_IDX_ID = 21312;  // used in DocSegment.cpp
const unsigned MAX_DOC_IDX_ID = 22104;


//const string IMG_INFO_NAME("./Data/s1.1");
const string INF_INFO_NAME("./Data/sun.iidx"); // inverted index file, e.g.:
//朱德  14383 16151 16151 16151 1683 207 6302 7889 8218 8218 8637
//朱古力  1085 1222

// a 90,000+ line character file covering special symbols, punctuation and hanzi
const string DOC_IDX_NAME("./Data/Doc.idx"); // document index file
const string RAWPAGE_FILE_NAME("./Data/Tianwang.swu.iceway.1.0");

//iceway
const string DOC_FILE_NAME = "Tianwang.swu.iceway.1.0";  // used in DocIndex.cpp
const string Data_DOC_FILE_NAME = "./Data/Tianwang.swu.iceway.1.0";  // used in Snapshot.cpp


//const string RM_THUMBNAIL_FILES("rm -f ~/public_html/ImgSE/timg/*");

//const string THUMBNAIL_DIR("/ImgSE/timg/");


#endif // _COMM_H_040708_
//DocIndex.h end--------------------------------------------------------------

//DocIndex.cpp start-----------------------------------------------------------

// (the standard header names were stripped in the blog copy; likely <iostream> and <fstream>)
#include <iostream>
#include <fstream>
#include "Md5.h"
#include "Url.h"
#include "Document.h"

//iceway(mnsc)
#include "Comm.h"
#include <string>

using namespace std;

int main(int argc, char* argv[])
{
    //ifstream ifs("Tianwang.raw.2559638448");
    //ifstream ifs("Tianwang.raw.3023555472");
    //iceway(mnsc)
    ifstream ifs(DOC_FILE_NAME.c_str()); // open the raw crawl file (e.g. Tianwang.raw.3023555472), the most original input
    if (!ifs)
    {
        cerr << "Cannot open " << "tianwang.img.info" << " for input\n";
        return -1;
    }
    ofstream ofsUrl("Url.idx", ios::in|ios::out|ios::trunc|ios::binary); // create and open Url.idx
    if( !ofsUrl )
    {
        cout << "error open file " << endl;
    }

    ofstream ofsDoc("Doc.idx", ios::in|ios::out|ios::trunc|ios::binary); // create and open Doc.idx
    if( !ofsDoc )
    {
        cout << "error open file " << endl;
    }

    ofstream ofsDocId2Url("DocId2Url.idx", ios::in|ios::out|ios::trunc|ios::binary); // create and open DocId2Url.idx
    if( !ofsDocId2Url )
    {
        cout << "error open file " << endl;
    }

    int cnt=0; // document IDs are counted from 0
    string strLine,strPage;
    CUrl iUrl;
    CDocument iDocument;
    CMD5 iMD5;

    int nOffset = ifs.tellg();
    while (getline(ifs, strLine))
    {
        if (strLine[0]=='\0' || strLine[0]=='#' || strLine[0]=='\n')
        {
            nOffset = ifs.tellg();
            continue;
        }

        if (!strncmp(strLine.c_str(), "version: 1.0", 12)) // if the first line of a record is "version: 1.0", keep parsing
        {
            if(!getline(ifs, strLine)) break;
            if (!strncmp(strLine.c_str(), "url: ", 4)) // if the second line starts with "url:", keep parsing
            {
                iUrl.m_sUrl = strLine.substr(5); // take the URL content after the five characters "url: "
                iMD5.GenerateMD5( (unsigned char*)iUrl.m_sUrl.c_str(), iUrl.m_sUrl.size() ); // MD5-hash the URL
                iUrl.m_sChecksum = iMD5.ToString(); // turn the digest bytes into a string; implemented in Md5.h

            } else
            {
                continue;
            }

            while (getline(ifs, strLine))
            {
                if (!strncmp(strLine.c_str(), "length: ", 8)) // keep reading until the "length: " line (around relative line five)
                {
                    sscanf(strLine.substr(8).c_str(), "%d", &(iDocument.m_nLength)); // store the page's actual content length in iDocument
                    break;
                }
            }

            getline(ifs, strLine); // skip the blank line deliberately left as relative line six

            iDocument.m_nDocId = cnt;    // store the document ID in iDocument
            iDocument.m_nPos = nOffset;  // offset of this record in the big file (also where the previous record ended)
            char *pContent = new char[iDocument.m_nLength+1]; // buffer of the document's length

            memset(pContent, 0, iDocument.m_nLength+1); // zero-initialize
            ifs.read(pContent, iDocument.m_nLength); // read the document content (including the protocol header) using the obtained length
            iMD5.GenerateMD5( (unsigned char*)pContent, iDocument.m_nLength );
            iDocument.m_sChecksum = iMD5.ToString(); // turn the digest bytes into a string; implemented in Md5.h

            delete[] pContent;

            ofsUrl << iUrl.m_sChecksum ;                    // write the MD5-hashed URL to Url.idx
            ofsUrl << "\t" << iDocument.m_nDocId << endl;   // tab-separated on the same line: write the document ID to Url.idx

            ofsDoc << iDocument.m_nDocId ;                  // write the document ID to Doc.idx
            ofsDoc << "\t" << iDocument.m_nPos ;            // tab-separated: write this record's offset in the big file to Doc.idx
            //ofsDoc << "\t" << iDocument.m_nLength ;
            ofsDoc << "\t" << iDocument.m_sChecksum << endl; // tab-separated: write the MD5 checksum of the document content to Doc.idx

            ofsDocId2Url << iDocument.m_nDocId ;            // write the document ID to DocId2Url.idx
            ofsDocId2Url << "\t" << iUrl.m_sUrl << endl;    // write the document's full URL to DocId2Url.idx

            cnt++; // this document is done; move on to the next document ID
        }

        nOffset = ifs.tellg();

    }

    // the last line holds only the final document count and the end offset of the last document
    ofsDoc << cnt ;
    ofsDoc << "\t" << nOffset << endl;


    return(0);
}

//DocIndex.cpp end-----------------------------------------------------------
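For orientation, when the loop above processes the example record shown in part [5] (url: http://***.105.138.175/Default2.asp?lang=gb, length: 38413) as document 0, it appends lines shaped like the following; the checksums and offset here are illustrative placeholders, not real values:

Url.idx:        5c36868a9c5117eadbda747cbdb0725f	0
Doc.idx:        0	0	bc9ce846d7987c4534f53d423380ba70
DocId2Url.idx:  0	http://***.105.138.175/Default2.asp?lang=gb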

Learning Search Engines Top-Down, Analysis and Complete Annotation of the PKU Tianwang Search Engine TSE [5]: Building the Inverted Index and the Files Involved
http://www.shnenglu.com/jrckkyy/archive/2009/12/10/102943.html

Sorry to have kept everyone waiting; I was busy with exams, which are finally over. Without further ado, let's get started.

TSE packs all the crawled web documents into one big file and then builds the index over the data in that single file as a whole; this involves several steps.

1.  The document index (Doc.idx) keeps information about each document.

It is a fixed width ISAM (Index sequential access mode) index, orderd by docID.

The information stored in each entry includes a pointer into the repository,

a document length, a document checksum.

 

//Doc.idx  docID   document length   checksum (MD5 hash)

0 0 bc9ce846d7987c4534f53d423380ba70

1 76760 4f47a3cad91f7d35f4bb6b2a638420e5

2 141624 d019433008538f65329ae8e39b86026c

3 142350 5705b8f58110f9ad61b1321c52605795

//Doc.idx end

 

  The url index (url.idx) is used to convert URLs into docIDs.

 

//url.idx

5c36868a9c5117eadbda747cbdb0725f 0

3272e136dd90263ee306a835c6c70d77 1

6b8601bb3bb9ab80f868d549b5c5a5f3 2

3f9eba99fa788954b5ff7f35a5db6e1f 3

//url.idx end

 

It is a list of URL checksums with their corresponding docIDs and is sorted by

checksum. In order to find the docID of a particular URL, the URL's checksum

is computed and a binary search is performed on the checksums file to find its

docID.
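To make that lookup concrete, here is a small sketch, not TSE code, of a binary search over (checksum, docID) pairs loaded from Url.idx.sort_uniq; the entry type and function name are made up:

#include <string>
#include <vector>
using namespace std;

struct UrlEntry { string checksum; int docid; };   // one line of Url.idx.sort_uniq

// Assumes the entries are sorted by checksum, as produced by `sort Url.idx | uniq`.
// Returns the docID for the given URL checksum, or -1 if it is not present.
int FindDocId(const vector<UrlEntry>& entries, const string& checksum)
{
    size_t lo = 0, hi = entries.size();
    while (lo < hi)
    {
        size_t mid = lo + (hi - lo) / 2;
        if (entries[mid].checksum < checksum)  lo = mid + 1;
        else                                   hi = mid;
    }
    if (lo < entries.size() && entries[lo].checksum == checksum)
        return entries[lo].docid;
    return -1;
}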

 

 ./DocIndex

  got Doc.idx, Url.idx, DocId2Url.idx // the Doc.idx, DocId2Url.idx and Url.idx in the Data folder

 

//DocId2Url.idx

0   http://*.*.edu.cn/index.aspx

1   http://*.*.edu.cn/showcontent1.jsp?NewsID=118

2   http://*.*.edu.cn/0102.html

3   http://*.*.edu.cn/0103.html

//DocId2Url.idx end

 

2.  sort Url.idx|uniq > Url.idx.sort_uniq // the Url.idx.sort_uniq in the Data folder

 

//Url.idx.sort_uniq

// sorted by hash value

000bfdfd8b2dedd926b58ba00d40986b 1111

000c7e34b653b5135a2361c6818e48dc 1831

0019d12f438eec910a06a606f570fde8 366

0033f7c005ec776f67f496cd8bc4ae0d 2103

 

3. Segment document to terms, (with finding document according to the url)

 ./DocSegment Tianwang.raw.2559638448  // Tianwang.raw.2559638448 is the crawled file; every page includes its HTTP header

  got Tianwang.raw.2559638448.seg  

 

//Tianwang.raw.2559638448: inside the raw crawl file, individual documents are delimited by the "version:" line, </html> and newlines as markers

version: 1.0

url: http://***.105.138.175/Default2.asp?lang=gb

origin: http://***.105.138.175/

date: Fri, 23 May 2008 20:01:36 GMT

ip: 162.105.138.175

length: 38413

 

HTTP/1.1 200 OK

Server: Microsoft-IIS/5.0

Date: Fri, 23 May 2008 11:17:49 GMT

Connection: keep-alive

Connection: Keep-Alive

Content-Length: 38088

Content-Type: text/html; Charset=gb2312

Expires: Fri, 23 May 2008 11:17:49 GMT

Set-Cookie: ASPSESSIONIDSSTRDCAB=IMEOMBIAIPDFCKPAEDJFHOIH; path=/

Cache-control: private

 

 

 

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"

"

<html>

<head>

<title>Apabi数字资源平台</title>

<meta http-equiv="Content-Type" content="text/html; charset=gb2312">

<META NAME="ROBOTS" CONTENT="INDEX,NOFOLLOW">

<META NAME="DESCRIPTION" CONTENT="数字图书馆 方正数字图书馆 电子图书 电子书 ebook e书 Apabi 数字资源平台">

<link rel="stylesheet" type="text/css" href="css\common.css">

 

<style type="text/css">

<!--

.style4 {color: #666666}

-->

</style>

 

<script LANGUAGE="vbscript">

...

</script>

 

<Script Language="javascript">

...

</Script>

</head>

<body leftmargin="0" topmargin="0">

</body>

</html>

//Tianwang.raw.2559638448 end

 

//Tianwang.raw.2559638448.seg: each page becomes lines like the following (note there is no extra blank line as a separator)

1

...

...

...

2

...

...

...

//Tianwang.raw.2559638448.seg end

 

// the following steps are the Tiny search part and are not strictly required

4. Create forward index (docid-->termid)  // build the forward index

 ./CrtForwardIdx Tianwang.raw.2559638448.seg > moon.fidx

 

//Tianwang.raw.2559638448.seg: each page becomes lines like the following
//segmented terms   DocID
1
三星/  s/  手机/  论坛/  ,/  手机/  铃声/  下载/  ,/  手机/  图片/  下载/  ,/  手机/
2
...
...
...
//Tianwang.raw.2559638448.seg end
 
 
//moon.fidx  
 
//for each document id, the terms segmented out of that document:    term  DocID
 
都会(x)  2391 
 
?nbsp;  2391 
 
那些  2391 
 
拥有  2391 
 
?nbsp;  2391 
 
?nbsp;  2391 
 
?nbsp;  2391 
 
?nbsp;  2391 
 
视野  2391 
 
?nbsp;  2391 
 
H?nbsp;  2391 
 
?nbsp;  2180 
 
研究生部    2180 
 
主页  2180 
 
培养  2180 
 
理  2180 
 
栏目  2180 
 
下蝲  2180 
 
Q?nbsp;  2180 
 
?nbsp;  2180 
 
关于  2180 
 
做好  2180 
 
q?nbsp;  2180 
 
国家  2180 
 
公派  2180 
 
研究?2180 
 
目  2180 
 
//moon.fidx end  
 
 
 
5.# set | grep "LANG" 
 
LANG=en; export LANG;  
 
sort moon.fidx > moon.fidx.sort  
 
 
 
6. Create inverted index (termid-->docid)    // build the inverted index
 
    ./CrtInvertedIdx moon.fidx.sort > sun.iidx  
 
 
 
//sun.iidx  // roughly half the size of the forward index file
 
花工   236 
 
花v   2103 
 
花卉   1018 1061 1061 1061 1730 1730 1730 1730 1730 1852 949 949 
 
p   447 447 
 
花木   1061 
 
花呢   1430 
 
花期   447 447 447 447 447 525 
 
花钱   174 236 
 
p   1730 1730 
 
p品种     1660 
 
q   450 526 
 
花式   1428 1430 1430 1430 
 
q   1430 1430 
 
花序   447 447 447 447 447 450 
 
qQ   136 137 
 
p   450 450 
 
//sun.iidx  end  
 
 
 
TSESearch   CGI program for query  
 
Snapshot    CGI program for page snapshot  
 
 

 



Learning Search Engines Top-Down, Analysis and Complete Annotation of the PKU Tianwang Search Engine TSE [4]: Summary
http://www.shnenglu.com/jrckkyy/archive/2009/12/10/102942.html

After the previous three articles you should have an intuitive picture of the supposedly mysterious search engine: much like an ordinary PHP-style server-side script, it takes the keywords from the front end, segments them with a dictionary, runs a relevance analysis against the pre-built inverted index, and formats the resulting hits for output. The hard technical points are:

1. Choosing the dictionary (language habits differ across eras and regions, so the smallest dictionary units differ as well).

2. Building the inverted index (this involves crawling and index construction, both covered in detail later; the efficiency, service quality and freshness bottlenecks of a search engine live here).

3. Relevance analysis (the algorithm that segments crawled documents for indexing must match the one that segments the user's keywords).

Later articles will focus on crawling and index construction.

Learning Search Engines Top-Down, Analysis and Complete Annotation of the PKU Tianwang Search Engine TSE [3]: Query Keyword Segmentation and Relevance Analysis
http://www.shnenglu.com/jrckkyy/archive/2009/12/10/102941.html

From the earlier annotations we know that once the query keywords and the dictionary file are ready, the program enters the query-keyword segmentation stage.

//In TSESearch.cpp:

 CHzSeg iHzSeg;      //include ChSeg/HzSeg.h

 iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery);  // segment the query obtained via GET into the form "中国/  的/  你们/  等/"

 vector<string> vecTerm;
 iQuery.ParseQuery(vecTerm);     // push the "/"-separated keywords, in order, into a vector container

 set<string> setRelevantRst;
 iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);

 gettimeofday(&end_tv,&tz);
 // search end, the search is finished

Now look at this method of CHzSeg:

//ChSeg/HzSeg.h
/**
 * Annotation
 * Cleans up a sentence and handles the different character classes before segmentation.
 * @access  public
 * @param   CDict, string   the dictionary and the sentence to segment
 * @return  string          the segmented sentence
 */
// process a sentence before segmentation
string CHzSeg::SegmentSentenceMM (CDict &dict, string s1) const
{
    string s2="";
    unsigned int i,len;

    while (!s1.empty())
    {
        unsigned char ch=(unsigned char) s1[0];
        if(ch<128)
        { // deal with ASCII
            // (this branch was garbled in the blog copy and is reconstructed approximately:
            //  consume a run of single-byte characters and append it as one token)
            i=1;
            len = s1.size();
            while (i<len && (unsigned char)s1[i]<128 && s1[i]!='\n' && s1[i]!='\r') // LF, CR
                i++;
            if (ch!=' ' && ch!='\n' && ch!='\r') // SP, LF, CR
                s2 += s1.substr(0, i) + SEPARATOR;

            if (i <= s1.size())  // yhf
                s1=s1.substr(i);
            else break;  //yhf

            continue;
        }
        else if (ch<176)
        { // Chinese punctuation and other non-hanzi double-byte characters
            i = 0;
            len = s1.length();

            while(i<len && ((unsigned char)s1[i]<176) && ((unsigned char)s1[i]>=161)
              && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=162 && (unsigned char)s1[i+1]<=168)))
              && (!((unsigned char)s1[i]==161 && ((unsigned char)s1[i+1]>=171 && (unsigned char)s1[i+1]<=191)))
              && (!((unsigned char)s1[i]==163 && ((unsigned char)s1[i+1]==172 || (unsigned char)s1[i+1]==161)
              || (unsigned char)s1[i+1]==168 || (unsigned char)s1[i+1]==169 || (unsigned char)s1[i+1]==186
              || (unsigned char)s1[i+1]==187 || (unsigned char)s1[i+1]==191)))
            {
                i=i+2; // assume there are no half Chinese characters
            }

            if (i==0) i=i+2;

            // do not treat the full-width Chinese space as a token
            if (!(ch==161 && (unsigned char)s1[1]==161))
            {
                if (i <= s1.size())  // yhf
                    // other non-hanzi double-byte characters may occur consecutively
                    s2 += s1.substr(0, i) + SEPARATOR;
                else break; // yhf
            }

            if (i <= s1.size())  // yhf
                s1=s1.substr(i);
            else break;     //yhf

            continue;
        }


    // from here on, handle runs of Chinese characters (hanzi)

        i = 2;
        len = s1.length();

        while(i<len && (unsigned char)s1[i]>=176)
//    while(i<len && (unsigned char)s1[i]>=128 && (unsigned char)s1[i]!=161)
            i+=2;

        s2+=SegmentHzStrMM(dict, s1.substr(0,i));

        if (i <= len)    // yhf
            s1=s1.substr(i);
        else break; // yhf
    }

    return s2;
}
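SegmentSentenceMM returns the sentence with every token followed by the separator: for example a query like 三星手机铃声 would come back roughly as "三星/  手机/  铃声/  " (the exact token boundaries depend on the dictionary). That is the "/"-separated form that ParseQuery splits below.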
  

//Query.cpp
/**
 * Annotation
 * Pushes the "/"-separated keywords, in order, into a vector container.
 *
 * @access  public
 * @param   vector<string>   the vector container receiving the terms
 * @return  void
 */
void CQuery::ParseQuery(vector<string> &vecTerm)
{
    string::size_type idx;
    while ( (idx = m_sSegQuery.find("/  ")) != string::npos ) {
        vecTerm.push_back(m_sSegQuery.substr(0,idx));
        m_sSegQuery = m_sSegQuery.substr(idx+3);
    }
}

/**
 * Annotation
 * Relevance analysis for the query: builds the result set setRelevantRst. // this is where the bottleneck lies
 *
 * @access  public
 * @param   vector<string>, map, set   the segmented query terms, the inverted-index map, the relevance result set
 * @return  bool
 */
bool CQuery::GetRelevantRst
(
    vector<string> &vecTerm,
    map<string, string> &mapBuckets,
    set<string> &setRelevantRst
) const
{
    set<string> setSRst;

    bool bFirst=true;
    vector<string>::iterator itTerm = vecTerm.begin();

    for ( ; itTerm != vecTerm.end(); ++itTerm )
    {
        // keep a copy of the results accumulated for the previous terms
        setSRst.clear();
        copy(setRelevantRst.begin(), setRelevantRst.end(), inserter(setSRst,setSRst.begin()));

        map<string, int> mapRstDoc;
        string docid;
        int doccnt;

        map<string, string>::iterator itBuckets = mapBuckets.find(*itTerm);
        if (itBuckets != mapBuckets.end())
        {
            // the posting list for this term: docIDs separated by spaces
            string strBucket = (*itBuckets).second;
            string::size_type idx;
            idx = strBucket.find_first_not_of(" ");
            strBucket = strBucket.substr(idx);

            while ( (idx = strBucket.find(" ")) != string::npos )
            {
                docid = strBucket.substr(0,idx);
                doccnt = 0;

                if (docid.empty()) continue;

                // count how often this docID occurs in the posting list
                map<string, int>::iterator it = mapRstDoc.find(docid);
                if ( it != mapRstDoc.end() )
                {
                    doccnt = (*it).second + 1;
                    mapRstDoc.erase(it);
                }
                mapRstDoc.insert( pair<string, int>(docid,doccnt) );

                strBucket = strBucket.substr(idx+1);
            }

            // remember the last one
            docid = strBucket;
            doccnt = 0;
            map<string, int>::iterator it = mapRstDoc.find(docid);
            if ( it != mapRstDoc.end() )
            {
                doccnt = (*it).second + 1;
                mapRstDoc.erase(it);
            }
            mapRstDoc.insert( pair<string, int>(docid,doccnt) );
        }

        // sort by term frequency
        multimap<int, string, greater<int> > newRstDoc;   // descending by frequency (the comparator was lost in the blog copy)
        map<string, int>::iterator it0 = mapRstDoc.begin();
        for ( ; it0 != mapRstDoc.end(); ++it0 ){
            newRstDoc.insert( pair<int, string>((*it0).second,(*it0).first) );
        }

        // keep only documents that also matched every previous term (set intersection)
        multimap<int, string, greater<int> >::iterator itNewRstDoc = newRstDoc.begin();
        setRelevantRst.clear();
        for ( ; itNewRstDoc != newRstDoc.end(); ++itNewRstDoc ){
            string docid = (*itNewRstDoc).second;

            if (bFirst==true) {
                setRelevantRst.insert(docid);
                continue;
            }

            if ( setSRst.find(docid) != setSRst.end() ){
                setRelevantRst.insert(docid);
            }
        }

        //cout << "setRelevantRst.size(): " << setRelevantRst.size() << "<BR>";
        bFirst = false;
    }
    return true;
}
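To see what GetRelevantRst does, consider a made-up two-term query whose posting lists in mapBuckets are 手机 -> "12 12 7 30" and 铃声 -> "12 30 30 5". For 手机, the docIDs {12, 7, 30} are collected (ranked by how often each occurs in the posting list) and, since it is the first term, they all enter setRelevantRst. For 铃声, the docIDs {12, 30, 5} are collected, but only those already present in the previous result set are kept, so the final setRelevantRst is {12, 30}: the documents containing every query term.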

What remains is the display. Everything above only processes data to obtain the setRelevantRst result set, so there is not much more to say: from here on it works much like PHP or any other scripting language, formatting the result set and printing it out.
//TSESearch.cpp

    // now start displaying the results
    CDisplayRst iDisplayRst;
    iDisplayRst.ShowTop();

    float used_msec = (end_tv.tv_sec-begin_tv.tv_sec)*1000
        +((float)(end_tv.tv_usec-begin_tv.tv_usec))/(float)1000;

    iDisplayRst.ShowMiddle(iQuery.m_sQuery,used_msec,
            setRelevantRst.size(), iQuery.m_iStart);

    iDisplayRst.ShowBelow(vecTerm,setRelevantRst,vecDocIdx,iQuery.m_iStart);

 



Learning Search Engines Top-Down, Analysis and Complete Annotation of the PKU Tianwang Search Engine TSE [2]: A Walk Through the Query Processing Program
http://www.shnenglu.com/jrckkyy/archive/2009/12/10/102940.html

From the previous article, "[1] Finding the Search Engine Entry Point", we know the whole program starts from main() in TSESearch.cpp. Let's focus on this piece of code:

//TSESearch.cpp
 CQuery iQuery;
 iQuery.GetInputs();  // the actual processing starts here
 // current query & result page number
 iQuery.SetQuery();
 iQuery.SetStart();

 // begin to search
 gettimeofday(&begin_tv,&tz); // start timing, to measure how long the search takes

 iQuery.GetInvLists(mapBuckets);  // load the whole inverted index into a map (a bottleneck)
 iQuery.GetDocIdx(vecDocIdx);     // load the document index into a vector (a bottleneck)

 CHzSeg iHzSeg;  //include ChSeg/HzSeg.h
 iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery); // segment the query obtained via GET into the form "中国/  的/  你们/  等/"

 vector<string> vecTerm;
 iQuery.ParseQuery(vecTerm);  // push the "/"-separated keywords, in order, into a vector container

 set<string> setRelevantRst;
 iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);

 gettimeofday(&end_tv,&tz);
 // search end

Following this order, we first look into CQuery, the class of the iQuery object.

//Query.cpp

1. GetInputs

This method converts the variables passed from the front end via GET into the HtmlInputs array of structs, as in the following example and code:

// assuming the front-end query keyword is "1", HtmlInputs then contains:
// HtmlInputs[0].Name  word
// HtmlInputs[0].Value 1
// HtmlInputs[1].Name  www
// HtmlInputs[1].Value 搜索
// HtmlInputs[2].Name  cdtype
// HtmlInputs[2].Value GB
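These entries come from a request whose query string has the shape word=1&www=%XX%XX%XX%XX&cdtype=GB, where the %XX bytes are the percent-encoded GB2312 form of 搜索 and a '+' would stand for a space; GetInputs below walks that string on '=', '&' and '+' to fill HtmlInputs.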

 
/*
 * Get form information through environment variables.
 * return 0 if succeed, otherwise exit.
 */
/**
 * Annotation
 * Parses the form data passed by GET (or POST).
 *
 * @access  public
 * @return  int 0 on success
 */
int CQuery::GetInputs()
{
    int i,j;
 char *mode = getenv("REQUEST_METHOD"); // request method from the environment; here it is GET
    char *tempstr; // one decoded name or value
    char *in_line; // the raw GET query string or POST body
 int length;  // length of the GET query string or POST body

 cout << "Content-type: text/html\n\n";
 //cout << "Cache-Control: no-cache\n";
 //cout << "Expires: Tue, 08 Apr 1997 17:20:00 GMT\n";
 //cout << "Expires: 0\n";
 //cout << "Pragma: no-cache\n\n";

 cout << "\n";
 cout << "\n";
 //cout << "\n";
 //cout << "\n";
 //cout << "\n";
 cout << "\n";
 cout.flush(); // flush the output buffer: emit the head and the opening html tags
 //cout <<"" << endl;

 if (mode==NULL) return 1;

 if (strcmp(mode, "POST") == 0)
 {
  length = atoi(getenv("CONTENT_LENGTH")); // for POST, the body length comes from CONTENT_LENGTH
  if (length==0 || length>=256)
   return 1;
  in_line = (char*)malloc(length + 1);
  read(STDIN_FILENO, in_line, length);
  in_line[length]='\0';
 }
 else if (strcmp(mode, "GET") == 0)
 {
  char* inputstr = getenv("QUERY_STRING"); // for GET, the parameters come from QUERY_STRING
  length = strlen(inputstr);
  if (inputstr==0 || length>=256)
   return 1;

  // copy the parameters after '?' into in_line
  in_line = (char*)malloc(length + 1);
  strcpy(in_line, inputstr); // careful about overflow here
 }


 tempstr = (char*)malloc(length + 1); // buffer for the decoded names and values
 if(tempstr == NULL)
 {
  printf("\n");
  printf("\n");
  printf("Major failure #1;please notify the webmaster\n");
  printf("\n");
  fflush(stdout); // flush stdout
  exit(2); // error exit
 }

 j=0;
 for (i=0; i < length; i++)   // (this loop header and the '=' branch were garbled in the blog copy; reconstructed)
 {
  if (in_line[i] == '=')
  {
   tempstr[j] = '\0';
   CStrFun::Translate(tempstr); // decode URL-encoded parameters: %** -> char
   strcpy(HtmlInputs[HtmlInputCount].Name,tempstr);
   if (i == length - 1)
   {
    strcpy(HtmlInputs[HtmlInputCount].Value,"");
    HtmlInputCount++;
   }
   j=0;
  }
  else if ((in_line[i] == '&') || (i==length-1))
  {
   if (i==length-1)
   {
    if(in_line[i] == '+')tempstr[j]=' ';
    else tempstr[j] = in_line[i];
    j++;
   }
   tempstr[j]='\0';
   CStrFun::Translate(tempstr); // decode URL-encoded parameters: %** -> char
   strcpy(HtmlInputs[HtmlInputCount].Value,tempstr);
   HtmlInputCount++;
   j=0;
  }
  else if (in_line[i] == '+')
  {
   tempstr[j]=' ';
   j++;
  }
  else
  {
   tempstr[j]=in_line[i]; // accumulate the current name or value (e.g. word, www, cdtype)
   j++;
  }
 }

 /*
 for (int kk = 0; kk < HtmlInputCount ; ++kk )
 {
  cout<<"Name="<<HtmlInputs[kk].Name<<"<br>";
  cout<<"Value="<<HtmlInputs[kk].Value<<"<br>";
 }
 // with the query keyword "1" this prints:
 //Name=word
 //Value=1
 //Name=www
 //Value= 搜索
 //Name=cdtype
 //Value=GB
 */

 if(in_line) free(in_line);
 if(tempstr) free(tempstr);

 return 0;
}
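CStrFun::Translate is referenced above but not shown in this post; as a rough illustration of what URL percent-decoding does, here is a hypothetical sketch, not the real CStrFun::Translate:

#include <cctype>
#include <cstdlib>

// Hypothetical in-place percent-decoder: turns each "%XY" into the byte 0xXY.
void TranslateSketch(char *s)
{
    char *src = s, *dst = s;
    while (*src)
    {
        if (*src == '%' && isxdigit((unsigned char)src[1]) && isxdigit((unsigned char)src[2]))
        {
            char hex[3] = { src[1], src[2], '\0' };
            *dst++ = (char)strtol(hex, NULL, 16);   // decode the two hex digits
            src += 3;
        }
        else
        {
            *dst++ = *src++;
        }
    }
    *dst = '\0';
}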
 
2. SetQuery

//Query.cpp
void CQuery::SetQuery()
{
 string q = HtmlInputs[0].Value;
 CStrFun::Str2Lower(q,q.size()); // convert upper case to lower case
 m_sQuery = q;  // set the query keywords
}

3. SetStart

4. GetInvLists

bool CQuery::GetInvLists(map<string, string> &mapBuckets) const
{
 ifstream ifsInvInfo(INF_INFO_NAME.c_str(), ios::binary); // open the inverted index for reading in binary mode; INF_INFO_NAME is defined in Comm.h: const string INF_INFO_NAME("./Data/sun.iidx");
 // in the inverted index file a tab '\t' separates the index term from its document IDs, e.g.:
 // 朱德  14383 16151 16151 16151 1683 207 6302 7889 8218 8218 8637
 // 朱古力  1085 1222

 if (!ifsInvInfo) {
  cerr << "Cannot open " << INF_INFO_NAME << " for input\n";
  return false;
 }
 string strLine, strWord, strDocNum;
 // read the stream line by line into strLine and process each line
 while (getline(ifsInvInfo, strLine)) {
  string::size_type idx;
  string tmp;
  idx = strLine.find("\t");
  strWord = strLine.substr(0,idx);
  strDocNum = strLine.substr(idx+1);
  mapBuckets.insert(map<string,string>::value_type (strWord, strDocNum)); // store the two-column inverted table in the map

  /*
  map<string, string>::iterator iter;
  int kkk = 0;
  for (iter = mapBuckets.begin(); kkk != 10; ++iter)
  {
   cout<<iter->first<<"  "<<iter->second<<"<br>";
   ++kkk;
  }
  cout.flush();
  */
 }
 return true;
}
 
5. GetDocIdx

bool CQuery::GetDocIdx(vector<DocIdx> &vecDocIdx) const
{
 ifstream ifs(DOC_IDX_NAME.c_str(), ios::binary); // open the document index in binary mode; DOC_IDX_NAME is defined in Comm.h: const string DOC_IDX_NAME("./Data/Doc.idx");
 // 0  0       bc9ce846d7987c4534f53d423380ba70
 // 1  76760   4f47a3cad91f7d35f4bb6b2a638420e5
 // 2  141624  d019433008538f65329ae8e39b86026c

 if (!ifs) {
  cerr << "Cannot open " << DOC_IDX_NAME << " for input\n";
  return false;
 }

 string strLine, strDocid, strUrl;
 while (getline(ifs,strLine)){
  DocIdx di;

  sscanf( strLine.c_str(), "%d%d", &di.docid, &di.offset ); // keep only the first two columns: docID and offset
  vecDocIdx.push_back(di); // append to the vector of DocIdx structs
 }

 return true;
}
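With vecDocIdx in memory, fetching a document back from the big crawl file is just a matter of using two consecutive offsets, which is how DocSegment and the snapshot feature work. A small sketch under that assumption, reusing the DocIdx fields read above and RAWPAGE_FILE_NAME from Comm.h (the function name is made up):

// Sketch: read document docId from the raw crawl file using consecutive Doc.idx offsets.
// Assumes docId+1 is still a valid entry; Doc.idx ends with one extra sentinel line, so it is.
string ReadRawDocument(const vector<DocIdx>& vecDocIdx, int docId)
{
    ifstream ifs(RAWPAGE_FILE_NAME.c_str(), ios::binary);
    int length = vecDocIdx[docId+1].offset - vecDocIdx[docId].offset - 1;
    string content(length, '\0');
    ifs.seekg(vecDocIdx[docId].offset);   // jump to the start of this record
    ifs.read(&content[0], length);        // read the whole record (headers plus page body)
    return content;
}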

 



Learning Search Engines Top-Down, Analysis and Complete Annotation of the PKU Tianwang Search Engine TSE [1]: Finding the Search Engine Entry Point
http://www.shnenglu.com/jrckkyy/archive/2009/12/10/102939.html

Since posts on my Baidu blog (http://hi.baidu.com/jrckkyy) are limited in length, future original articles will be published on CSDN and CU first and then on the Baidu blog, which will mostly carry originals plus good articles found elsewhere.

In the hacker spirit, I will gradually publish my notes from analyzing and annotating the TSE search engine. Veterans and experts can simply skim past; if you are willing to offer pointers I would be very grateful, and anyone with questions can leave a comment to discuss. My level is limited, so mistakes in the analysis and translation are inevitable; please bear with me.

Last semester I read James F. Kurose's Computer Networking: A Top-Down Approach Featuring the Internet (3rd edition reprint) and found it really well written (if you have not read it, do get a copy). So I am borrowing the top-down approach to study search engines, starting from what the user actually sees. Below is the search-box code of several major search engines; pay particular attention to the action attribute of each form.

Yahoo, http://www.yahoo.com/:

<form name=s1 style="margin-bottom:0" action="<table cellpadding=0 cellspacing=0 border=0><tr><td>
<input type=text size=30 name=p title="enter search terms here">&nbsp;
<input type=submit value=Search>&nbsp;&nbsp;</td><td><font face=arial size=-2>·&nbsp;
<a href="
search</a><br>·&nbsp;
<a href="
popular</a></font></td></tr></table></form>
Google, http://www.g.cn:

<form method=GET action=/search><tr><td nowrap>
<font size=-1><input type=text name=q size=41 maxlength=2048 value="jrckkyy" title="Google 搜索"> <input type=submit name=btnG value="Google 搜索"><input type=hidden name=complete value=1><input type=hidden name=hl value="zh-CN"><input type=hidden name=newwindow value=1><input type=hidden name=sa value="2"></font></td></tr></form>
Baidu, http://www.baidu.com:

<form name=f2 action="/s">
<tr valign="middle">
<td nowrap>
<input type=hidden name=ct value="0">
<input type=hidden name=ie value="gb2312">
<input type=hidden name=bs value="jrckkyy">
<input type=hidden name=sr>
<input type=hidden name=z value="">
<input type=hidden name=cl value=3>
<input type=hidden name=f value=8>
<input name=wd size="35" class=i value="jrckkyy" maxlength=100>
<input type=submit value=百度一下> <input type=button value=结果中找 onclick="return bq(f2,1,0);">&nbsp;&nbsp;&nbsp;</td>
<td nowrap><a href="</tr>
</form>
Tianwang, http://www.tianwang.com/:

<form name=f action="/cgi-bin/tw" method=get>
                <td valign=center width=634 background=images/index_image_02.gif>
                    <table height=46 cellspacing=0 cellpadding=0 width=600 align=right  border=0>
                        <tbody>
                            <tr>
                                <td height=50>
                                    <table cellspacing=0 cellpadding=0 width=600 border=0>
                                        <tbody>
                                            <tr>
                                  <td width="524" height="30" valign="bottom">
                                        <div align="center">                                  <input name="word" type="text" size="40" maxlength="255" onClick="this.focus();checkWord(this,1)" onblutesr='checkWord(this,0)' value='请输入资源名称'>
                                            <font color=#ffffff> &nbsp;
                                            <select onChange=reRange(this.selectedIndex) name=range>
                                                <script language=javascript>...
                           <!--
                           for(var i = 0; i < rescode.length; i++) ...{
                               if(i == 0) ...{
                                   document.write('<option value="0" selected>' + rescode[i][0] + '</option>');
                               } else ...{
                                   document.write('<option value="' + i + '">' + rescode[i][0] + '</option>');
                               }
                           }
                           document.f.range.selectedIndex = 0;
                           -->
                         </script>
                                            </select>
                                            </font>-<font color=#ffffff>
                                            <select name=cd>
                                                <script language=javascript>...
                           <!--
                           var ind = document.f.range.selectedIndex;
                           var len = (rescode[ind].length - 1) / 2;
                           var sel = 0;
                           for(var i = 0; i < len; i++) ...{
                               document.write('<option value="' + rescode[ind][2*i+1] + '">' + rescode[ind][2*i+2] + '</option>');
                               if(rescode[ind][2*i+1] == 0)
                                   sel = i;
                           }
                           document.f.cd.selectedIndex = sel;
                           -->
                 </script>
                                            </select>
                                            </font></div>
                                    </td>
                <td width="71" valign="bottom"><input id=submit2 type=image height=22 width=40 src="images/so2.gif" align=absMiddle name=submit></td>
              </tr>
                                            <tr>
                                                <td colspan=3 height=25 class=style16>
                                                    <div align=center></div>
                                                </td>
                                            </tr>
                                        </tbody>
                                    </table>
                                </td>
                            </tr>
                        </tbody>
                    </table>
                </td>
            </form>
The test server (TSE):

<form method="get" action="/cgi-bin/index/TSESearch" name="tw">
        <td width="100%" height="25" align="center">                          
        <input type="text" name="word" size="55">
        <input type="submit" value=" 搜索" name="www">
        </td>                          
        <input type="hidden" name="cdtype" value="GB">                        
        </form>   
From the form attributes above you can see that they all use the GET method, with a CGI program, here written in C/C++, as the handler. CGI stands for Common Gateway Interface, a mechanism through which an HTTP server "talks" to a program on the same or another machine; the program must run on the web server. CGI has gradually been displaced in recent years by dynamic languages such as PHP, Java, ASP, Perl, Python and Ruby, but its advantage in speed and runtime efficiency remains hard to replace.

Below is the annotated TSE CGI entry program; the entry points of other search engines should look similar.

 

/**
 * Annotation
 * @Copyright (c) 2008, the development group
 * All rights reserved.
 *
 * @filesource  TSESearch.cpp
 * @author  jrckkyy <jrckkyy@163.com>
 *
 * Let's start
 *
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/time.h>
#include <unistd.h>

#include <iostream>
#include <fstream>
#include <list>

#include "Comm.h"           // the two index files and one data file
#include "Query.h"          // query processing
#include "Document.h"       // html document handling
#include "StrFun.h"         // string helpers
#include "ChSeg/Dict.h"     // the character dictionary
#include "ChSeg/HzSeg.h"
#include "DisplayRst.h"     // renders the result page: top, middle and bottom parts

using namespace std;

/*
 * A inverted file(INF) includes a term-index file & a inverted-lists file.
 * A inverted-lists consists of many bucks(posting lists).
 * The term-index file is stored at vecTerm, and
 * the inverted-lists is sored at mapBuckets.
 */

/**
 * Annotation
 * Entry point of the search program; the front-end keywords are submitted to this CGI program,
 * e.g. /cgi-bin/index/TSESearch?word=123&start=1
 * The inverted file consists of a term-index file and an inverted-lists file.
 * The inverted lists are made of many buckets (posting lists).
 * The term-index is kept in vecTerm, and the inverted lists are kept in mapBuckets.
 *
 * @access  public
 * @param   int, char   receive the parameters passed by the front-end GET request
 * @return  int 0
 */
int main(int argc, char* argv[])
{
    struct timeval begin_tv, end_tv;
    struct timezone tz;

    CDict iDict;
    map<string, string> dictMap, mapBuckets;
    vector<DocIdx> vecDocIdx;    //Document.h

    CQuery iQuery;
    iQuery.GetInputs();        // the actual processing starts here
    // current query & result page number
    iQuery.SetQuery();
    iQuery.SetStart();

    // begin to search
    gettimeofday(&begin_tv,&tz);    // start timing, to measure how long the search takes

    iQuery.GetInvLists(mapBuckets);     // load the whole inverted index into a map (a bottleneck)
    iQuery.GetDocIdx(vecDocIdx);        // load the document index into a vector (a bottleneck)

    CHzSeg iHzSeg;        //include ChSeg/HzSeg.h
    iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery);    // segment the query obtained via GET into the form "中国/  的/  你们/  等/"

    vector<string> vecTerm;
    iQuery.ParseQuery(vecTerm);        // push the "/"-separated keywords, in order, into a vector container

    set<string> setRelevantRst;
    iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst);

    gettimeofday(&end_tv,&tz);
    // search end

    // now start displaying the results
    CDisplayRst iDisplayRst;
    iDisplayRst.ShowTop();

    float used_msec = (end_tv.tv_sec-begin_tv.tv_sec)*1000
        +((float)(end_tv.tv_usec-begin_tv.tv_usec))/(float)1000;

    iDisplayRst.ShowMiddle(iQuery.m_sQuery,used_msec,
            setRelevantRst.size(), iQuery.m_iStart);

    iDisplayRst.ShowBelow(vecTerm,setRelevantRst,vecDocIdx,iQuery.m_iStart);

    return 0;

}

 

 


